PC Open 101

home *** CD-ROM | disk | FTP | other *** search

/ PC Open 101 / PC Open 101 CD 1.bin / CD1 / INTERNET / EMAIL / pop file / setup.exe / Classifier / Bayes.pm next >

Wrap

Perl POD Document | 2004-09-22 | 128.5 KB | 3,856 lines

# POPFILE LOADABLE MODULE package Classifier::Bayes; use POPFile::Module; @ISA = ("POPFile::Module"); #---------------------------------------------------------------------------- # # Bayes.pm --- Naive Bayes text classifier # # Copyright (c) 2001-2004 John Graham-Cumming # # This file is part of POPFile # # POPFile is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; either version 2 of the License, or # (at your option) any later version. # # POPFile is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with POPFile; if not, write to the Free Software # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA # # Modified by Sam Schinke (sschinke@users.sourceforge.net) # Merged with db code from Scott Leighton (helphand@users.sourceforge.net) # #---------------------------------------------------------------------------- use strict; use warnings; use locale; use Classifier::MailParse; use IO::Handle; use DBI; use Digest::MD5 qw( md5_hex ); use MIME::Base64; # This is used to get the hostname of the current machine # in a cross platform way use Sys::Hostname; # A handy variable containing the value of an EOL for networks my $eol = "\015\012"; # Korean characters definition my $ksc5601_sym = '(?:[\xA1-\xAC][\xA1-\xFE])'; my $ksc5601_han = '(?:[\xB0-\xC8][\xA1-\xFE])'; my $ksc5601_hanja = '(?:[\xCA-\xFD][\xA1-\xFE])'; my $ksc5601 = "(?:$ksc5601_sym|$ksc5601_han|$ksc5601_hanja)"; my $eksc = "(?:$ksc5601|[\x81-\xC6][\x41-\xFE])"; #extended ksc #---------------------------------------------------------------------------- # new # # Class new() function #---------------------------------------------------------------------------- sub new { my $type = shift; my $self = POPFile::Module->new(); # Set this to 1 to get scores for individual words in message detail $self->{wordscores__} = 0; # Choice for the format of the "word matrix" display. $self->{wmformat__} = ''; # Just our hostname $self->{hostname__} = ''; # File Handle for DBI database $self->{db__} = {}; $self->{history__} = 0; # To save time we also 'prepare' some commonly used SQL statements # and cache them here, see the function db_connect__ for details $self->{db_get_buckets__} = 0; $self->{db_get_wordid__} = 0; $self->{db_get_word_count__} = 0; $self->{db_put_word_count__} = 0; $self->{db_get_bucket_unique_counts__} = 0; $self->{db_get_unique_word_count__} = 0; $self->{db_get_bucket_word_counts__} = 0; $self->{db_get_full_total__} = 0; $self->{db_get_bucket_parameter__} = 0; $self->{db_set_bucket_parameter__} = 0; $self->{db_get_bucket_parameter_default__} = 0; $self->{db_get_buckets_with_magnets__} = 0; $self->{db_delete_zero_words__} = 0; # Caches the name of each bucket and relates it to both the bucket # ID in the database and whether it is pseudo or not # # Subkeys used are: # # id The bucket ID in the database # pseudo 1 if this is a pseudo bucket $self->{db_bucketid__} = {}; # Caches the IDs that map to parameter types $self->{db_parameterid__} = {}; # Caches looked up parameter values on a per bucket basis $self->{db_parameters__} = {}; # Used to parse mail messages $self->{parser__} = new Classifier::MailParse; # The possible colors for buckets $self->{possible_colors__} = [ 'red', 'green', 'blue', 'brown', # PROFILE BLOCK START 'orange', 'purple', 'magenta', 'gray', 'plum', 'silver', 'pink', 'lightgreen', 'lightblue', 'lightcyan', 'lightcoral', 'lightsalmon', 'lightgrey', 'darkorange', 'darkcyan', 'feldspar', 'black' ]; # PROFILE BLOCK STOP # Precomputed per bucket probabilities $self->{bucket_start__} = {}; # A very unlikely word $self->{not_likely__} = {}; # The expected corpus version # # DEPRECATED This is only used when upgrading old flat file corpus files # to the database $self->{corpus_version__} = 1; # The unclassified cutoff this value means that the top # probabilily must be n times greater than the second probability, # default is 100 times more likely $self->{unclassified__} = log(100); # Used to tell the caller whether a magnet was used in the last # mail classification $self->{magnet_used__} = 0; $self->{magnet_detail__} = 0; # This maps session keys (long strings) to user ids. If there's # an entry here then the session key is valid and can be used in # the POPFile API. See the methods get_session_key and # release_session_key for details $self->{api_sessions__} = {}; # Must call bless before attempting to call any methods bless $self, $type; $self->name( 'bayes' ); return $self; } #---------------------------------------------------------------------------- # # forked # # This is called inside a child process that has just forked, since # the child needs access to the database we open it # #---------------------------------------------------------------------------- sub forked { my ( $self ) = @_; $self->db_connect__(); } #---------------------------------------------------------------------------- # # initialize # # Called to set up the Bayes module's parameters # #---------------------------------------------------------------------------- sub initialize { my ( $self ) = @_; # This is the name for the database $self->config_( 'database', 'popfile.db' ); # This is the 'connect' string used by DBI to connect to the # database, if you decide to change from using SQLite to some # other database (e.g. MySQL, Oracle, ... ) this *should* be all # you need to change. The additional parameters user and auth are # needed for some databases. # # Note that the dbconnect string will be interpolated before being # passed to DBI and the variable $dbname can be used within it and # it resolves to the full path to the database named in the # database parameter above. $self->config_( 'dbconnect', 'dbi:SQLite:dbname=$dbname' ); $self->config_( 'dbuser', '' ); $self->config_( 'dbauth', '' ); # SQLite 1.05+ have some problems we are resolving. # This lets us give a nice message and then disable # the version checking later $self->config_( 'bad_sqlite_version', '3.0.0' ); # No default unclassified weight is the number of times more sure POPFile # must be of the top class vs the second class, default is 100 times more $self->config_( 'unclassified_weight', 100 ); # The corpus is kept in the 'corpus' subfolder of POPFile # # DEPRECATED This is only used to find an old corpus that might # need to be upgraded $self->config_( 'corpus', 'corpus' ); # The characters that appear before and after a subject modification $self->config_( 'subject_mod_left', '[' ); $self->config_( 'subject_mod_right', ']' ); # Get the hostname for use in the X-POPFile-Link header $self->{hostname__} = hostname; # Allow the user to override the hostname $self->config_( 'hostname', $self->{hostname__} ); # If set to 1 then the X-POPFile-Link will have < > around the # URL (i.e. X-POPFile-Link: <http://foo.bar>) when set to 0 there # are none (i.e. X-POPFile-Link: http://foo.bar) $self->config_( 'xpl_angle', 0 ); $self->mq_register_( 'COMIT', $self ); $self->mq_register_( 'RELSE', $self ); return 1; } #---------------------------------------------------------------------------- # # deliver # # Called by the message queue to deliver a message # # There is no return value from this method # #---------------------------------------------------------------------------- sub deliver { my ( $self, $type, @message ) = @_; if ( $type eq 'COMIT' ) { $self->classified( $message[0], $message[2] ); } if ( $type eq 'RELSE' ) { $self->release_session_key_private__( $message[0] ); } } #---------------------------------------------------------------------------- # # start # # Called to start the Bayes module running # #---------------------------------------------------------------------------- sub start { my ( $self ) = @_; # In Japanese mode, explicitly set LC_COLLATE to C. This is to # avoid Perl crash on Windows because default LC_COLLATE of # Japanese Win is Japanese_Japan.932(Shift_JIS), which is # different from the charset POPFile uses for Japanese # characters(EUC-JP). if ( defined( $self->module_config_( 'html', 'language' ) ) && ( $self->module_config_( 'html', 'language' ) eq 'Nihongo' ) ) { use POSIX qw( locale_h ); setlocale( LC_COLLATE, 'C' ); } # Pass in the current interface language for language specific parsing $self->{parser__}->{lang__} = $self->module_config_( 'html', 'language' ); $self->{unclassified__} = log( $self->config_( 'unclassified_weight' ) ); if ( !$self->db_connect__() ) { return 0; } $self->upgrade_predatabase_data__(); return 1; } #---------------------------------------------------------------------------- # # stop # # Called when POPFile is terminating # #---------------------------------------------------------------------------- sub stop { my ( $self ) = @_; $self->db_disconnect__(); } #---------------------------------------------------------------------------- # # classified # # Called to inform the module about a classification event # # There is no return value from this method # #---------------------------------------------------------------------------- sub classified { my ( $self, $session, $class ) = @_; $self->set_bucket_parameter( $session, $class, 'count', # PROFILE BLOCK START $self->get_bucket_parameter( $session, $class, 'count' ) + 1 ); # PROFILE BLOCK STOP } #---------------------------------------------------------------------------- # # reclassified # # Called to inform the module about a reclassification from one bucket # to another # # session Valid API session # bucket The old bucket name # newbucket The new bucket name # undo 1 if this is an undo operation # # There is no return value from this method # #---------------------------------------------------------------------------- sub reclassified { my ( $self, $session, $bucket, $newbucket, $undo ) = @_; $self->log_( 0, "Reclassification from $bucket to $newbucket" ); my $c = $undo?-1:1; if ( $bucket ne $newbucket ) { my $count = $self->get_bucket_parameter( $session, $newbucket, 'count' ); my $newcount = $count + $c; $newcount = 0 if ( $newcount < 0 ); $self->set_bucket_parameter( $session, $newbucket, 'count', $newcount ); $count = $self->get_bucket_parameter( $session, $bucket, 'count' ); $newcount = $count - $c; $newcount = 0 if ( $newcount < 0 ); $self->set_bucket_parameter( $session, $bucket, 'count', $newcount ); my $fncount = $self->get_bucket_parameter( $session, $newbucket, 'fncount' ); my $newfncount = $fncount + $c; $newfncount = 0 if ( $newfncount < 0 ); $self->set_bucket_parameter( $session, $newbucket, 'fncount', $newfncount ); my $fpcount = $self->get_bucket_parameter( $session, $bucket, 'fpcount' ); my $newfpcount = $fpcount + $c; $newfpcount = 0 if ( $newfpcount < 0 ); $self->set_bucket_parameter( $session, $bucket, 'fpcount', $newfpcount ); } } #---------------------------------------------------------------------------- # # get_color # # Retrieves the color for a specific word, color is the most likely bucket # # $session Session key returned by get_session_key # $word Word to get the color of # #---------------------------------------------------------------------------- sub get_color { my ( $self, $session, $word ) = @_; my $max = -10000; my $color = 'black'; for my $bucket ($self->get_buckets( $session )) { my $prob = $self->get_value_( $session, $bucket, $word ); if ( $prob != 0 ) { if ( $prob > $max ) { $max = $prob; $color = $self->get_bucket_parameter( $session, $bucket, 'color' ); } } } return $color; } #---------------------------------------------------------------------------- # # get_not_likely_ # # Returns the probability of a word that doesn't appear # #---------------------------------------------------------------------------- sub get_not_likely_ { my ( $self, $session ) = @_; my $userid = $self->valid_session_key__( $session ); return undef if ( !defined( $userid ) ); return $self->{not_likely__}{$userid}; } #---------------------------------------------------------------------------- # # get_value_ # # Returns the value for a specific word in a bucket. The word is # converted to the log value of the probability before return to get # the raw value just hit the hash directly or call get_base_value_ # #---------------------------------------------------------------------------- sub get_value_ { my ( $self, $session, $bucket, $word ) = @_; my $value = $self->db_get_word_count__( $session, $bucket, $word ); if ( defined( $value ) && ( $value > 0 ) ) { # Profiling notes: # # I tried caching the log of the total value and then doing # log( $value ) - $cached and this turned out to be # much slower than this single log with a division in it return log( $value / $self->get_bucket_word_count( $session, $bucket ) ); } else { return 0; } } sub get_base_value_ { my ( $self, $session, $bucket, $word ) = @_; my $value = $self->db_get_word_count__( $session, $bucket, $word ); if ( defined( $value ) ) { return $value; } else { return 0; } } #---------------------------------------------------------------------------- # # set_value_ # # Sets the value for a word in a bucket and updates the total word # counts for the bucket and globally # #---------------------------------------------------------------------------- sub set_value_ { my ( $self, $session, $bucket, $word, $value ) = @_; if ( $self->db_put_word_count__( $session, $bucket, $word, $value ) == 1 ) { # If we set the word count to zero then clean it up by deleting the # entry my $userid = $self->valid_session_key__( $session ); my $bucketid = $self->{db_bucketid__}{$userid}{$bucket}{id}; $self->{db_delete_zero_words__}->execute( $bucketid ); return 1; } else { return 0; } } #---------------------------------------------------------------------------- # # get_sort_value_ behaves the same as get_value_, except that it # returns not_likely__ rather than 0 if the word is not found. This # makes its result more suitable as a sort key for bucket ranking. # #---------------------------------------------------------------------------- sub get_sort_value_ { my ( $self, $session, $bucket, $word ) = @_; my $v = $self->get_value_( $session, $bucket, $word ); if ( $v == 0 ) { my $userid = $self->valid_session_key__( $session ); return undef if ( !defined( $userid ) ); return $self->{not_likely__}{$userid}; } else { return $v; } } #---------------------------------------------------------------------------- # # update_constants__ # # Updates not_likely and bucket_start # #---------------------------------------------------------------------------- sub update_constants__ { my ( $self, $session ) = @_; my $wc = $self->get_word_count( $session ); my $userid = $self->valid_session_key__( $session ); return undef if ( !defined( $userid ) ); if ( $wc > 0 ) { $self->{not_likely__}{$userid} = -log( 10 * $wc ); foreach my $bucket ($self->get_buckets( $session )) { my $total = $self->get_bucket_word_count( $session, $bucket ); if ( $total != 0 ) { $self->{bucket_start__}{$userid}{$bucket} = log( $total / $wc ); } else { $self->{bucket_start__}{$userid}{$bucket} = 0; } } } else { $self->{not_likely__}{$userid} = 0; } } #---------------------------------------------------------------------------- # # db_connect__ # # Connects to the POPFile database and returns 1 if successful # #---------------------------------------------------------------------------- sub db_connect__ { my ( $self ) = @_; # Connect to the database, note that the database must exist for # this to work, to make this easy for people POPFile we will # create the database automatically here using the file # 'popfile.sql' which should be located in the same directory the # Classifier/Bayes.pm module # If we are using SQLite then the dbname is actually the name of a # file, and hence we treat it like one, otherwise we leave it # alone my $dbname; my $dbconnect = $self->config_( 'dbconnect' ); my $dbpresent; my $sqlite = ( $dbconnect =~ /sqlite/i ); if ( $sqlite ) { $dbname = $self->get_user_path_( $self->config_( 'database' ) ); $dbpresent = ( -e $dbname ) || 0; } else { $dbname = $self->config_( 'database' ); $dbpresent = 1; } # Now perform the connect, note that this is database independent # at this point, the actual database that we connect to is defined # by the dbconnect parameter. $dbconnect =~ s/\$dbname/$dbname/g; $self->log_( 0, "Attempting to connect to $dbconnect ($dbpresent)" ); $self->{db__} = DBI->connect( $dbconnect, # PROFILE BLOCK START $self->config_( 'dbuser' ), $self->config_( 'dbauth' ) ); # PROFILE BLOCK STOP $self->log_( 0, "Using SQLite library version " . $self->{db__}{sqlite_version}); # We check to make sure we're not using DBD::SQLite 1.05 or greater # which uses SQLite V 3 If so, we'll use DBD::SQLite2 and SQLite 2.8, # which is still compatible with old databases if ( $self->{db__}{sqlite_version} gt $self->config_('bad_sqlite_version' ) ) { $self->log_( 0, "Substituting DBD::SQLite2 for DBD::SQLite 1.05" ); $self->log_( 0, "Please install DBD::SQLite2 and set dbconnect to use DBD::SQLite2" ); $dbconnect =~ s/SQLite/SQLite2/; undef $self->{db__}; # $self->db_disconnect__(); $self->{db__} = DBI->connect( $dbconnect, # PROFILE BLOCK START $self->config_( 'dbuser' ), $self->config_( 'dbauth' ) ); # PROFILE BLOCK STOP } if ( !defined( $self->{db__} ) ) { $self->log_( 0, "Failed to connect to database and got error $DBI::errstr" ); return 0; } if ( !$dbpresent ) { if ( !$self->insert_schema__( $sqlite ) ) { return 0; } } # Now check for a need to upgrade the database because the schema # has been changed. From POPFile v0.22.0 there's a special # 'popfile' table inside the database that contains the schema # version number. If the version number doesn't match or is # missing then do the upgrade. open SCHEMA, '<' . $self->get_root_path_( 'Classifier/popfile.sql' ); <SCHEMA> =~ /-- POPFILE SCHEMA (\d+)/; my $version = $1; close SCHEMA; my $need_upgrade = 1; # # retrieve the SQL_IDENTIFIER_QUOTE_CHAR for the database then use it # to strip off any sqlquotechars from the table names we retrieve # my $sqlquotechar = $self->{db__}->get_info(29) || ''; my @tables = map { s/$sqlquotechar//g; $_ } ($self->{db__}->tables()); foreach my $table (@tables) { if ( $table eq 'popfile' ) { my @row = $self->{db__}->selectrow_array( 'select version from popfile;' ); if ( $#row == 0 ) { $need_upgrade = ( $row[0] != $version ); } } } if ( $need_upgrade ) { print "\n\nDatabase schema is outdated, performing automatic upgrade\n"; # The database needs upgrading, so we are going to dump out # all the data in the database as INSERT statements in a # temporary file, then DROP all the tables in the database, # then recreate the schema from the new schema and finally # rerun the inserts. my $i = 0; my $ins_file = $self->get_user_path_( 'insert.sql' ); open INSERT, '>' . $ins_file; foreach my $table (@tables) { next if ( $table eq 'popfile' ); if ( $sqlite && ( $table =~ /^sqlite_/ ) ) { next; } if ( $i > 99 ) { print "\n"; } print " Saving table $table\n "; my $t = $self->{db__}->prepare( "select * from $table;" ); $t->execute; $i = 0; while ( 1 ) { if ( ( ++$i % 100 ) == 0 ) { print "[$i]"; flush STDOUT; } my @rows = $t->fetchrow_array; last if ( $#rows == -1 ); print INSERT "INSERT INTO $table ("; for my $i (0..$t->{NUM_OF_FIELDS}-1) { if ( $i != 0 ) { print INSERT ','; } print INSERT $t->{NAME}->[$i]; } print INSERT ') VALUES ('; for my $i (0..$t->{NUM_OF_FIELDS}-1) { if ( $i != 0 ) { print INSERT ','; } my $val = $rows[$i]; if ( $t->{TYPE}->[$i] !~ /^int/i ) { $val = '' if ( !defined( $val ) ); $val = $self->{db__}->quote( $val ); } else { $val = 'NULL' if ( !defined( $val ) ); } print INSERT $val; } print INSERT ");\n"; } } close INSERT; if ( $i > 99 ) { print "\n"; } foreach my $table (@tables) { if ( $sqlite && ( $table =~ /^sqlite_/ ) ) { next; } print " Dropping old table $table\n"; $self->{db__}->do( "DROP TABLE $table;" ); } print " Inserting new database schema\n"; if ( !$self->insert_schema__( $sqlite ) ) { return 0; } print " Restoring old data\n "; $self->{db__}->begin_work; open INSERT, '<' . $ins_file; $i = 0; while ( <INSERT> ) { if ( ( ++$i % 100 ) == 0 ) { print "[$i]"; flush STDOUT; } s/[\r\n]//g; $self->{db__}->do( $_ ); } close INSERT; $self->{db__}->commit; unlink $ins_file; print "\nDatabase upgrade complete\n\n"; } # Now prepare common SQL statements for use, as a matter of convention the # parameters to each statement always appear in the following order: # # user # bucket # word # parameter $self->{db_get_buckets__} = $self->{db__}->prepare( # PROFILE BLOCK START 'select name, id, pseudo from buckets where buckets.userid = ?;' ); # PROFILE BLOCK STOP $self->{db_get_wordid__} = $self->{db__}->prepare( # PROFILE BLOCK START 'select id from words where words.word = ? limit 1;' ); # PROFILE BLOCK STOP $self->{db_get_userid__} = $self->{db__}->prepare( # PROFILE BLOCK START 'select id from users where name = ? and password = ? limit 1;' ); # PROFILE BLOCK STOP $self->{db_get_word_count__} = $self->{db__}->prepare( # PROFILE BLOCK START 'select matrix.times from matrix where matrix.bucketid = ? and matrix.wordid = ? limit 1;' ); # PROFILE BLOCK STOP $self->{db_put_word_count__} = $self->{db__}->prepare( # PROFILE BLOCK START 'replace into matrix ( bucketid, wordid, times ) values ( ?, ?, ? );' ); # PROFILE BLOCK STOP $self->{db_get_bucket_unique_counts__} = $self->{db__}->prepare( # PROFILE BLOCK START 'select count(matrix.wordid), buckets.name from matrix, buckets where buckets.userid = ? and matrix.bucketid = buckets.id group by buckets.name;' ); # PROFILE BLOCK STOP $self->{db_get_bucket_word_counts__} = $self->{db__}->prepare( # PROFILE BLOCK START 'select sum(matrix.times), buckets.name from matrix, buckets where matrix.bucketid = buckets.id and buckets.userid = ? group by buckets.name;' ); # PROFILE BLOCK STOP $self->{db_get_unique_word_count__} = $self->{db__}->prepare( # PROFILE BLOCK START 'select count(matrix.wordid) from matrix, buckets where matrix.bucketid = buckets.id and buckets.userid = ?;' ); # PROFILE BLOCK STOP $self->{db_get_full_total__} = $self->{db__}->prepare( # PROFILE BLOCK START 'select sum(matrix.times) from matrix, buckets where buckets.userid = ? and matrix.bucketid = buckets.id;' ); # PROFILE BLOCK STOP $self->{db_get_bucket_parameter__} = $self->{db__}->prepare( # PROFILE BLOCK START 'select bucket_params.val from bucket_params where bucket_params.bucketid = ? and bucket_params.btid = ?;' ); # PROFILE BLOCK STOP $self->{db_set_bucket_parameter__} = $self->{db__}->prepare( # PROFILE BLOCK START 'replace into bucket_params ( bucketid, btid, val ) values ( ?, ?, ? );' ); # PROFILE BLOCK STOP $self->{db_get_bucket_parameter_default__} = $self->{db__}->prepare( # PROFILE BLOCK START 'select bucket_template.def from bucket_template where bucket_template.id = ?;' ); # PROFILE BLOCK STOP $self->{db_get_buckets_with_magnets__} = $self->{db__}->prepare( # PROFILE BLOCK START 'select buckets.name from buckets, magnets where buckets.userid = ? and magnets.id != 0 and magnets.bucketid = buckets.id group by buckets.name order by buckets.name;' ); # PROFILE BLOCK STOP $self->{db_delete_zero_words__} = $self->{db__}->prepare( # PROFILE BLOCK START 'delete from matrix where matrix.times = 0 and matrix.bucketid = ?;' ); # PROFILE BLOCK STOP # Get the mapping from parameter names to ids into a local hash my $h = $self->{db__}->prepare( "select name, id from bucket_template;" ); $h->execute; while ( my $row = $h->fetchrow_arrayref ) { $self->{db_parameterid__}{$row->[0]} = $row->[1]; } $h->finish; return 1; } #---------------------------------------------------------------------------- # # insert_schema__ # # Insert the POPFile schema in a database # # $sqlite Set to 1 if this is a SQLite database # #---------------------------------------------------------------------------- sub insert_schema__ { my ( $self, $sqlite ) = @_; if ( -e $self->get_root_path_( 'Classifier/popfile.sql' ) ) { my $schema = ''; $self->log_( 0, "Creating database schema" ); open SCHEMA, '<' . $self->get_root_path_( 'Classifier/popfile.sql' ); while ( <SCHEMA> ) { next if ( /^--/ ); next if ( !/[a-z;]/ ); s/--.*$//; # If the line begins 'alter' and we are doing SQLite then ignore # the line if ( $sqlite && ( /^alter/i ) ) { next; } $schema .= $_; if ( ( /end;/ ) || ( /\);/ ) || ( /^alter/i ) ) { $self->{db__}->do( $schema ); $schema = ''; } } close SCHEMA; return 1; } else { $self->log_( 0, "Can't find the database schema" ); return 0; } } #---------------------------------------------------------------------------- # # db_disconnect__ # # Disconnect from the POPFile database # #---------------------------------------------------------------------------- sub db_disconnect__ { my ( $self ) = @_; $self->{db_get_buckets__}->finish; $self->{db_get_wordid__}->finish; $self->{db_get_userid__}->finish; $self->{db_get_word_count__}->finish; $self->{db_put_word_count__}->finish; $self->{db_get_bucket_unique_counts__}->finish; $self->{db_get_bucket_word_counts__}->finish; $self->{db_get_unique_word_count__}->finish; $self->{db_get_full_total__}->finish; $self->{db_get_bucket_parameter__}->finish; $self->{db_set_bucket_parameter__}->finish; $self->{db_get_bucket_parameter_default__}->finish; $self->{db_get_buckets_with_magnets__}->finish; $self->{db_delete_zero_words__}->finish; if ( defined( $self->{db__} ) ) { $self->{db__}->disconnect; undef $self->{db__}; } } #---------------------------------------------------------------------------- # # db_update_cache__ # # Updates our local cache of user and bucket ids. # # $session Must be a valid session # #---------------------------------------------------------------------------- sub db_update_cache__ { my ( $self, $session ) = @_; my $userid = $self->valid_session_key__( $session ); return undef if ( !defined( $userid ) ); delete $self->{db_bucketid__}{$userid}; $self->{db_get_buckets__}->execute( $userid ); while ( my $row = $self->{db_get_buckets__}->fetchrow_arrayref ) { $self->{db_bucketid__}{$userid}{$row->[0]}{id} = $row->[1]; $self->{db_bucketid__}{$userid}{$row->[0]}{pseudo} = $row->[2]; $self->{db_bucketcount__}{$userid}{$row->[0]} = 0; } $self->{db_get_bucket_word_counts__}->execute( $userid ); for my $b (sort keys %{$self->{db_bucketid__}{$userid}}) { $self->{db_bucketcount__}{$userid}{$b} = 0; $self->{db_bucketunique__}{$userid}{$b} = 0; } while ( my $row = $self->{db_get_bucket_word_counts__}->fetchrow_arrayref ) { $self->{db_bucketcount__}{$userid}{$row->[1]} = $row->[0]; } $self->{db_get_bucket_unique_counts__}->execute( $userid ); while ( my $row = $self->{db_get_bucket_unique_counts__}->fetchrow_arrayref ) { $self->{db_bucketunique__}{$userid}{$row->[1]} = $row->[0]; } $self->update_constants__( $session ); } #---------------------------------------------------------------------------- # # db_get_word_count__ # # Return the 'count' value for a word in a bucket. If the word is not # found in that bucket then returns undef. # # $session Valid session ID from get_session_key # $bucket bucket word is in # $word word to lookup # #---------------------------------------------------------------------------- sub db_get_word_count__ { my ( $self, $session, $bucket, $word ) = @_; my $userid = $self->valid_session_key__( $session ); return undef if ( !defined( $userid ) ); $self->{db_get_wordid__}->execute( $word ); my $result = $self->{db_get_wordid__}->fetchrow_arrayref; if ( !defined( $result ) ) { return undef; } my $wordid = $result->[0]; $self->{db_get_word_count__}->execute( $self->{db_bucketid__}{$userid}{$bucket}{id}, $wordid ); $result = $self->{db_get_word_count__}->fetchrow_arrayref; if ( defined( $result ) ) { return $result->[0]; } else { return undef; } } #---------------------------------------------------------------------------- # # db_put_word_count__ # # Update 'count' value for a word in a bucket, if the update fails # then returns 0 otherwise is returns 1 # # $session Valid session ID from get_session_key # $bucket bucket word is in # $word word to update # $count new count value # #---------------------------------------------------------------------------- sub db_put_word_count__ { my ( $self, $session, $bucket, $word, $count ) = @_; my $userid = $self->valid_session_key__( $session ); return undef if ( !defined( $userid ) ); # We need to have two things before we can start, the id of the # word in the words table (if there's none then we need to add the # word), the bucket id in the buckets table (which must exist) $word = $self->{db__}->quote($word); my $result = $self->{db__}->selectrow_arrayref( "select words.id from words where words.word = $word limit 1;"); if ( !defined( $result ) ) { $self->{db__}->do( "insert into words ( word ) values ( $word );" ); $result = $self->{db__}->selectrow_arrayref( "select words.id from words where words.word = $word limit 1;"); } my $wordid = $result->[0]; my $bucketid = $self->{db_bucketid__}{$userid}{$bucket}{id}; $self->{db_put_word_count__}->execute( $bucketid, $wordid, $count ); return 1; } #---------------------------------------------------------------------------- # # upgrade_predatabase_data__ # # Looks for old POPFile data (in flat files or BerkeleyDB tables) and # upgrades it to the SQL database. Data upgraded is removed. # #---------------------------------------------------------------------------- sub upgrade_predatabase_data__ { my ( $self ) = @_; my $c = 0; # There's an assumption here that this is the single user version # of POPFile and hence what we do is cheat and get a session key # assuming that the user name is admin with password '' my $session = $self->get_session_key( 'admin', '' ); if ( !defined( $session ) ) { $self->log_( 0, "Tried to get the session key for user admin and failed; cannot upgrade old data" ); return; } my @buckets = glob $self->get_user_path_( $self->config_( 'corpus' ) . '/*' ); foreach my $bucket (@buckets) { # A bucket directory must be a directory next unless ( -d $bucket ); next unless ( ( -e "$bucket/table" ) || ( -e "$bucket/table.db" ) ); return 0 if ( !$self->upgrade_bucket__( $session, $bucket ) ); my $color = ''; # See if there's a color file specified if ( open COLOR, '<' . "$bucket/color" ) { $color = <COLOR>; # Someone (who shall remain nameless) went in and manually created # empty color files in their corpus directories which would cause # $color at this point to be undefined and hence you'd get warnings # about undefined variables below. So this little test is to deal # with that user and to make POPFile a little safer which is always # a good thing if ( !defined( $color ) ) { $color = ''; } else { $color =~ s/[\r\n]//g; } close COLOR; unlink "$bucket/color"; } $bucket =~ /([[:alpha:]0-9-_]+)$/; $bucket = $1; $self->set_bucket_color( $session, $bucket, ($color eq '')?$self->{possible_colors__}[$c]:$color ); $c = ($c+1) % ($#{$self->{possible_colors__}}+1); } $self->release_session_key( $session ); return 1; } #---------------------------------------------------------------------------- # # upgrade_bucket__ # # Loads an individual bucket # # $session Valid session key from get_session_key # $bucket The bucket name # #---------------------------------------------------------------------------- sub upgrade_bucket__ { my ( $self, $session, $bucket ) = @_; $bucket =~ /([[:alpha:]0-9-_]+)$/; $bucket = $1; $self->create_bucket( $session, $bucket ); if ( open PARAMS, '<' . $self->get_user_path_( $self->config_( 'corpus' ) . "/$bucket/params" ) ) { while ( <PARAMS> ) { s/[\r\n]//g; if ( /^([[:lower:]]+) ([^\r\n\t ]+)$/ ) { $self->set_bucket_parameter( $session, $bucket, $1, $2 ); } } close PARAMS; unlink $self->get_user_path_( $self->config_( 'corpus' ) . "/$bucket/params" ); } # Pre v0.21.0 POPFile had GLOBAL parameters for subject modification, # XTC and XPL insertion. To make the upgrade as clean as possible # check these parameters so that if they were OFF we set the equivalent # per bucket to off foreach my $gl ( 'subject', 'xtc', 'xpl' ) { $self->log_( 1, "Checking deprecated parameter GLOBAL_$gl for $bucket\n" ); my $val = $self->{configuration__}->deprecated_parameter( "GLOBAL_$gl" ); if ( defined( $val ) && ( $val == 0 ) ) { $self->log_( 1, "GLOBAL_$gl is 0 for $bucket, overriding $gl\n" ); $self->set_bucket_parameter( $session, $bucket, $gl, 0 ); } } # See if there are magnets defined if ( open MAGNETS, '<' . $self->get_user_path_( $self->config_( 'corpus' ) . "/$bucket/magnets" ) ) { while ( <MAGNETS> ) { s/[\r\n]//g; # Because of a bug in v0.17.9 and earlier of POPFile the text of # some magnets was getting mangled by certain characters having # a \ prepended. Code here removes the \ in these cases to make # an upgrade smooth. if ( /^([^ ]+) (.+)$/ ) { my $type = $1; my $value = $2; # Some people were accidently creating magnets with # trailing whitespace which really confused them later # when their magnet did not match (see comment in # UI::HTML::magnet for more detail) $value =~ s/^[ \t]+//g; $value =~ s/[ \t]+$//g; $value =~ s/\$\?|\*|\||\(|$|\[|\]|\{|\}|\^|\$|\.)/$1/g; $self->create_magnet( $session, $bucket, $type, $value ); } else { # This branch is used to catch the original magnets in an # old version of POPFile that were just there for from # addresses only if ( /^(.+)$/ ) { my $value = $1; $value =~ s/\$\?|\*|\||\(|$|\[|\]|\{|\}|\^|\$|\.)/$1/g; $self->create_magnet( $session, $bucket, 'from', $value ); } } } close MAGNETS; unlink $self->get_user_path_( $self->config_( 'corpus' ) . "/$bucket/magnets" ); } # If there is no existing table but there is a table file (the old style # flat file used by POPFile for corpus storage) then create the new # database from it thus performing an automatic upgrade. if ( -e $self->get_user_path_( $self->config_( 'corpus' ) . "/$bucket/table" ) ) { $self->log_( 0, "Performing automatic upgrade of $bucket corpus from flat file to DBI" ); $self->{db__}->begin_work; if ( open WORDS, '<' . $self->get_user_path_( $self->config_( 'corpus' ) . "/$bucket/table" ) ) { my $wc = 1; my $first = <WORDS>; if ( defined( $first ) && ( $first =~ s/^__CORPUS__ __VERSION__ (\d+)// ) ) { if ( $1 != $self->{corpus_version__} ) { print STDERR "Incompatible corpus version in $bucket\n"; close WORDS; $self->{db__}->rollback; return 0; } else { $self->log_( 0, "Upgrading bucket $bucket..." ); while ( <WORDS> ) { if ( $wc % 100 == 0 ) { $self->log_( 0, "$wc" ); } $wc += 1; s/[\r\n]//g; if ( /^([^\s]+) (\d+)$/ ) { if ( $2 != 0 ) { $self->db_put_word_count__( $session, $bucket, $1, $2 ); } } else { $self->log_( 0, "Found entry in corpus for $bucket that looks wrong: \"$_\" (ignoring)" ); } } } if ( $wc > 1 ) { $wc -= 1; $self->log_( 0, "(completed $wc words)" ); } close WORDS; } else { close WORDS; $self->{db__}->rollback; unlink $self->get_user_path_( $self->config_( 'corpus' ) . "/$bucket/table" ); return 0; } $self->{db__}->commit; unlink $self->get_user_path_( $self->config_( 'corpus' ) . "/$bucket/table" ); } } # Now check to see if there's a BerkeleyDB-style table my $bdb_file = $self->get_user_path_( $self->config_( 'corpus' ) . "/$bucket/table.db" ); if ( -e $bdb_file ) { $self->log_( 0, "Performing automatic upgrade of $bucket corpus from BerkeleyDB to DBI" ); require BerkeleyDB; my %h; tie %h, "BerkeleyDB::Hash", -Filename => $bdb_file; $self->log_( 0, "Upgrading bucket $bucket..." ); $self->{db__}->begin_work; my $wc = 1; for my $word (keys %h) { if ( $wc % 100 == 0 ) { $self->log_( 0, "$wc" ); } next if ( $word =~ /__POPFILE__(LOG__TOTAL|TOTAL|UNIQUE)__/ ); $wc += 1; if ( $h{$word} != 0 ) { $self->db_put_word_count__( $session, $bucket, $word, $h{$word} ); } } $wc -= 1; $self->log_( 0, "(completed $wc words)" ); $self->{db__}->commit; untie %h; unlink $bdb_file; } return 1; } #---------------------------------------------------------------------------- # # magnet_match_helper__ # # Helper the determines if a specific string matches a certain magnet # type in a bucket, used by magnet_match_ # # $session Valid session from get_session_key # $match The string to match # $bucket The bucket to check # $type The magnet type to check # #---------------------------------------------------------------------------- sub magnet_match_helper__ { my ( $self, $session, $match, $bucket, $type ) = @_; my $userid = $self->valid_session_key__( $session ); return undef if ( !defined( $userid ) ); $match = lc($match); # In Japanese and Korean mode, disable locale. Sorting Japanese # and Korean with "use locale" is memory and time consuming, and # may cause perl crash. my @magnets; my $bucketid = $self->{db_bucketid__}{$userid}{$bucket}{id}; my $h = $self->{db__}->prepare( # PROFILE BLOCK START "select magnets.val, magnets.id from magnets, users, buckets, magnet_types where buckets.id = $bucketid and magnets.id != 0 and users.id = buckets.userid and magnets.bucketid = buckets.id and magnet_types.mtype = '$type' and magnets.mtid = magnet_types.id order by magnets.val;" ); # PROFILE BLOCK STOP $h->execute; while ( my $row = $h->fetchrow_arrayref ) { push @magnets, [$row->[0], $row->[1]]; } $h->finish; foreach my $m (@magnets) { my ( $magnet, $id ) = @{$m}; $magnet = lc($magnet); for my $i (0..(length($match)-length($magnet))) { if ( substr( $match, $i, length($magnet)) eq $magnet ) { $self->{magnet_used__} = 1; $self->{magnet_detail__} = $id; return 1; } } } return 0; } #---------------------------------------------------------------------------- # # magnet_match__ # # Helper the determines if a specific string matches a certain magnet # type in a bucket # # $session Valid session from get_session_key # $match The string to match # $bucket The bucket to check # $type The magnet type to check # #---------------------------------------------------------------------------- sub magnet_match__ { my ( $self, $session, $match, $bucket, $type ) = @_; return $self->magnet_match_helper__( $session, $match, $bucket, $type ); } #---------------------------------------------------------------------------- # # write_line__ # # Writes a line to a file and parses it unless the classification is # already known # # $file File handle for file to write line to # $line The line to write # $class (optional) The current classification # #---------------------------------------------------------------------------- sub write_line__ { my ( $self, $file, $line, $class ) = @_; print $file $line if defined( $file ); if ( $class eq '' ) { $self->{parser__}->parse_line( $line ); } } #---------------------------------------------------------------------------- # # add_words_to_bucket__ # # Takes words previously parsed by the mail parser and adds/subtracts # them to/from a bucket, this is a helper used by # add_messages_to_bucket, remove_message_from_bucket # # $session Valid session from get_session_key # $bucket Bucket to add to # $subtract Set to -1 means subtract the words, set to 1 means add # #---------------------------------------------------------------------------- sub add_words_to_bucket__ { my ( $self, $session, $bucket, $subtract ) = @_; my $userid = $self->valid_session_key__( $session ); return undef if ( !defined( $userid ) ); # Map the list of words to a list of counts currently in the database # then update those counts and write them back to the database. my $words; $words = join( ',', map( $self->{db__}->quote( $_ ), (sort keys %{$self->{parser__}{words__}}) ) ); $self->{get_wordids__} = $self->{db__}->prepare( # PROFILE BLOCK START "select id, word from words where word in ( $words );" ); # PROFILE BLOCK STOP $self->{get_wordids__}->execute; my @id_list; my %wordmap; while ( my $row = $self->{get_wordids__}->fetchrow_arrayref ) { push @id_list, ($row->[0]); $wordmap{$row->[1]} = $row->[0]; } $self->{get_wordids__}->finish; my $ids = join( ',', @id_list ); $self->{db_getwords__} = $self->{db__}->prepare( # PROFILE BLOCK START "select matrix.times, matrix.wordid from matrix where matrix.wordid in ( $ids ) and matrix.bucketid = $self->{db_bucketid__}{$userid}{$bucket}{id};" ); # PROFILE BLOCK STOP $self->{db_getwords__}->execute; my %counts; while ( my $row = $self->{db_getwords__}->fetchrow_arrayref ) { $counts{$row->[1]} = $row->[0]; } $self->{db_getwords__}->finish; $self->{db__}->begin_work; foreach my $word (keys %{$self->{parser__}->{words__}}) { # If there's already a count then it means that the word is # already in the database and we have its id in # $wordmap{$word} so for speed we execute the # db_put_word_count__ query here rather than going through # set_value_ which would need to look up the wordid again if ( defined( $wordmap{$word} ) && defined( $counts{$wordmap{$word}} ) ) { $self->{db_put_word_count__}->execute( $self->{db_bucketid__}{$userid}{$bucket}{id}, # PROFILE BLOCK START $wordmap{$word}, $counts{$wordmap{$word}} + $subtract * $self->{parser__}->{words__}{$word} ); # PROFILE BLOCK STOP } else { # If the word is not in the database and we are trying to # subtract then we do nothing because negative values are # meaningless if ( $subtract == 1 ) { $self->db_put_word_count__( $session, $bucket, $word, $self->{parser__}->{words__}{$word} ); } } } # If we were doing a subtract operation it's possible that some of # the words in the bucket now have a zero count and should be # removed if ( $subtract == -1 ) { $self->{db_delete_zero_words__}->execute( $self->{db_bucketid__}{$userid}{$bucket}{id} ); } $self->{db__}->commit; } #---------------------------------------------------------------------------- # # echo_to_dot_ # # $mail The stream (created with IO::) to send the message to (the # remote mail server) # $client (optional) The local mail client (created with IO::) that # needs the response # $file (optional) A file to print the response to, caller specifies # open style # $before (optional) String to send to client before the dot is sent # # echo all information from the $mail server until a single line with # a . is seen # # NOTE Also echoes the line with . to $client but not to $file # # Returns 1 if there was a . or 0 if reached EOF before we hit the . # #---------------------------------------------------------------------------- sub echo_to_dot_ { my ( $self, $mail, $client, $file, $before ) = @_; my $hit_dot = 0; my $isopen = open FILE, "$file" if ( defined( $file ) ); binmode FILE if ($isopen); while ( my $line = $self->slurp_( $mail ) ) { # Check for an abort last if ( $self->{alive_} == 0 ); # The termination has to be a single line with exactly a dot # on it and nothing else other than line termination # characters. This is vital so that we do not mistake a line # beginning with . as the end of the block if ( $line =~ /^\.(\r\n|\r|\n)$/ ) { $hit_dot = 1; if ( defined( $before ) && ( $before ne '' ) ) { print $client $before if ( defined( $client ) ); print FILE $before if ( defined( $isopen ) ); } # Note that there is no print FILE here. This is correct # because we do no want the network terminator . to appear # in the file version of any message print $client $line if ( defined( $client ) ); last; } print $client $line if ( defined( $client ) ); print FILE $line if ( defined( $isopen ) ); } close FILE if ( $isopen ); return $hit_dot; } #---------------------------------------------------------------------------- # # substr_euc__ # # "substr" function which supports EUC Japanese charset # # $pos Start position # $len Word length # #---------------------------------------------------------------------------- sub substr_euc__ { my ( $str, $pos, $len ) = @_; my $result_str; my $char; my $count = 0; if ( !$pos ) { $pos = 0; } if ( !$len ) { $len = length( $str ); } for ( $pos = 0; $count < $len; $pos++ ) { $char = substr( $str, $pos, 1 ); if ( $char =~ /[\x80-\xff]/ ) { $char = substr( $str, $pos++, 2 ); } $result_str .= $char; $count++; } return $result_str; } #---------------------------------------------------------------------------- # # generate_unique_session_key__ # # Returns a unique string based session key that can be used as a key # in the api_sessions__ # #---------------------------------------------------------------------------- sub generate_unique_session_key__ { my ( $self ) = @_; my @chars = ( 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', # PROFILE BLOCK START 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'U', 'V', 'W', 'X', 'Y', 'Z', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'A' ); # PROFILE BLOCK STOP my $session; do { $session = ''; my $length = int( 16 + rand(4) ); for my $i (0 .. $length) { my $random = $chars[int( rand(36) )]; # Just to add spice to things we sometimes lowercase the value if ( rand(1) < rand(1) ) { $random = lc($random); } $session .= $random; } } while ( defined( $self->{api_sessions__}{$session} ) ); return $session; } #---------------------------------------------------------------------------- # # release_session_key_private__ # # $session A session key previously returned by get_session_key # # Releases and invalidates the session key. Worker function that does the work # of release_session_key. # ****DO NOT CALL DIRECTLY**** # unless you want your session key released immediately, possibly preventing # asynchronous tasks from completing # #---------------------------------------------------------------------------- sub release_session_key_private__ { my ( $self, $session ) = @_; if ( defined( $self->{api_sessions__}{$session} ) ) { $self->log_( 1, "release_session_key releasing key $session for user $self->{api_sessions__}{$session}" ); delete $self->{api_sessions__}{$session}; } } #---------------------------------------------------------------------------- # # valid_session_key__ # # $session Session key returned by call to get_session_key # # Returns undef is the session key is not valid, or returns the user # ID associated with the session key which can be used in database # accesses # #---------------------------------------------------------------------------- sub valid_session_key__ { my ( $self, $session ) = @_; # This provides protection against someone using the XML-RPC # interface and calling this API directly to fish for session # keys, this must be called from within this module return undef if ( caller ne 'Classifier::Bayes' ); # If the session key is invalid then wait 1 second. This is done # to prevent people from calling a POPFile API such as # get_bucket_count with random session keys fishing for a valid # key. The XML-RPC API is single threaded and hence this will # delay all use of that API by one second. Of course in normal # use when the user knows the username/password or session key # then there is no delay if ( !defined( $self->{api_sessions__}{$session} ) ) { my ( $package, $filename, $line, $subroutine ) = caller; $self->log_( 0, "Invalid session key $session provided in $package @ $line" ); select( undef, undef, undef, 1 ); } return $self->{api_sessions__}{$session}; } #---------------------------------------------------------------------------- #---------------------------------------------------------------------------- # _____ _____ _____ _______ _____ _______ _______ _____ _____ #|_____] | | |_____] |______ | | |______ |_____| |_____] | #| |_____| | | __|__ |_____ |______ | | | __|__ # # The method below are public and may be accessed by other modules. # All of them may be accessed remotely through the XMLRPC.pm module # using the XML-RPC protocol # # Note that every API function expects to be passed a $session which # is obtained by first calling get_session_key with a valid username # and password. Once done call the method release_session_key. # # See POPFile::API for more details # #---------------------------------------------------------------------------- #---------------------------------------------------------------------------- #---------------------------------------------------------------------------- # # get_session_key # # $user The name of an existing user # $pwd The user's password # # Returns a string based session key if the username and password # match, or undef if not # #---------------------------------------------------------------------------- sub get_session_key { my ( $self, $user, $pwd ) = @_; # The password is stored in the database as an MD5 hash of the # username and password concatenated and separated by the string # __popfile__, so compute the hash here my $hash = md5_hex( $user . '__popfile__' . $pwd ); $self->{db_get_userid__}->execute( $user, $hash ); my $result = $self->{db_get_userid__}->fetchrow_arrayref; if ( !defined( $result ) ) { # The delay of one second here is to prevent people from trying out # username/password combinations at high speed to determine the # credentials of a valid user $self->log_( 0, "Attempt to login with incorrect credentials for user $user" ); select( undef, undef, undef, 1 ); return undef; } my $session = $self->generate_unique_session_key__(); $self->{api_sessions__}{$session} = $result->[0]; $self->db_update_cache__( $session ); $self->log_( 1, "get_session_key returning key $session for user $self->{api_sessions__}{$session}" ); return $session; } #---------------------------------------------------------------------------- # # release_session_key # # $session A session key previously returned by get_session_key # # Releases and invalidates the session key # #---------------------------------------------------------------------------- sub release_session_key { my ( $self, $session ) = @_; $self->mq_post_( "RELSE", $session ); } #---------------------------------------------------------------------------- # # get_top_bucket__ # # Helper function used by classify to get the bucket with the highest # score from data stored in a matrix of information (see definition of # %matrix in classify for details) and a list of potential buckets # # $userid User ID for database access # $id ID of a word in $matrix # $matrix Reference to the %matrix hash in classify # $buckets Reference to a list of buckets # # Returns the bucket in $buckets with the highest score # #---------------------------------------------------------------------------- sub get_top_bucket__ { my ( $self, $userid, $id, $matrix, $buckets ) = @_; my $best_probability = 0; my $top_bucket = 'unclassified'; for my $bucket (@$buckets) { my $probability = 0; if ( defined($$matrix{$id}{$bucket}) && ( $$matrix{$id}{$bucket} > 0 ) ) { $probability = $$matrix{$id}{$bucket} / $self->{db_bucketcount__}{$userid}{$bucket}; } if ( $probability > $best_probability ) { $best_probability = $probability; $top_bucket = $bucket; } } return $top_bucket; } #---------------------------------------------------------------------------- # # classify # # $session A valid session key returned by a call to get_session_key # $file The name of the file containing the text to classify (or undef # to use the data already in the parser) # $templ Reference to the UI template used for word score display # $matrix (optional) Reference to a hash that will be filled with the # word matrix used in classification # $idmap (optional) Reference to a hash that will map word ids in the # $matrix to actual words # # Splits the mail message into valid words, then runs the Bayes # algorithm to figure out which bucket it belongs in. Returns the # bucket name # #---------------------------------------------------------------------------- sub classify { my ( $self, $session, $file, $templ, $matrix, $idmap ) = @_; my $msg_total = 0; my $userid = $self->valid_session_key__( $session ); return undef if ( !defined( $userid ) ); $self->{unclassified__} = log( $self->config_( 'unclassified_weight' ) ); $self->{magnet_used__} = 0; $self->{magnet_detail__} = 0; if ( defined( $file ) ) { $self->{parser__}->parse_file( $file, # PROFILE BLOCK START $self->global_config_( 'message_cutoff' ) ); # PROFILE BLOCK STOP } # Check to see if this email should be classified based on a magnet # Get the list of buckets my @buckets = $self->get_buckets( $session ); for my $bucket ($self->get_buckets_with_magnets( $session )) { for my $type ($self->get_magnet_types_in_bucket( $session, $bucket )) { if ( $self->magnet_match__( $session, $self->{parser__}->get_header($type), $bucket, $type ) ) { return $bucket; } } } # If the user has not defined any buckets then we escape here # return unclassified return "unclassified" if ( $#buckets == -1 ); # The score hash will contain the likelihood that the given # message is in each bucket, the buckets are the keys for score # Set up the initial score as P(bucket) my %score; my %matchcount; # Build up a list of the buckets that are OK to use for # classification (i.e. that have at least one word in them). my @ok_buckets; for my $bucket (@buckets) { if ( $self->{bucket_start__}{$userid}{$bucket} != 0 ) { $score{$bucket} = $self->{bucket_start__}{$userid}{$bucket}; $matchcount{$bucket} = 0; push @ok_buckets, ( $bucket ); } } @buckets = @ok_buckets; # For each word go through the buckets and calculate # P(word|bucket) and then calculate P(word|bucket) ^ word count # and multiply to the score my $word_count = 0; # The correction value is used to generate score displays variable # which are consistent with the word scores shown by the GUI's # word lookup feature. It is computed to make the contribution of # a word which is unrepresented in a bucket zero. This correction # affects only the values displayed in the display; it has no # effect on the classification process. my $correction = 0; # Classification against the database works in a sequence of steps # to get the fastest time possible. The steps are as follows: # # 1. Convert the list of words returned by the parser into a list # of unique word ids that can be used in the database. This # requires a select against the database to get the word ids # (and associated words) which is then converted into two # things: @id_list which is just the sorted list of word ids # and %idmap which maps a word to its id. # # 2. Then run a second select that get the triplet (count, id, # bucket) for each word id and each bucket. The triplet # contains the word count from the database for each bucket and # each id, where there is an entry. That data gets loaded into # the sparse matrix %matrix. # # 3. Do the normal classification loop as before running against # the @id_list for the words and for each bucket. If there's an # entry in %matrix for the id/bucket combination then calculate # the probability, otherwise use the not_likely probability. # # NOTE. Since there is a single not_likely probability we do not # worry about the fact that the select in 1 might return a shorter # list of words than was found in the message (because some words # are not in the database) since the missing words will be the # same for all buckets and hence constitute a fixed scaling factor # on all the buckets which is irrelevant in deciding which the # winning bucket is. my $words; $words = join( ',', map( $self->{db__}->quote( $_ ), (sort keys %{$self->{parser__}{words__}}) ) ); $self->{get_wordids__} = $self->{db__}->prepare( # PROFILE BLOCK START "select id, word from words where word in ( $words ) order by id;" ); # PROFILE BLOCK STOP $self->{get_wordids__}->execute; my @id_list; my %temp_idmap; if ( !defined( $idmap ) ) { $idmap = \%temp_idmap; } while ( my $row = $self->{get_wordids__}->fetchrow_arrayref ) { push @id_list, ($row->[0]); $$idmap{$row->[0]} = $row->[1]; } $self->{get_wordids__}->finish; my $ids = join( ',', @id_list ); $self->{db_classify__} = $self->{db__}->prepare( # PROFILE BLOCK START "select matrix.times, matrix.wordid, buckets.name from matrix, buckets where matrix.wordid in ( $ids ) and matrix.bucketid = buckets.id and buckets.userid = $userid;" ); # PROFILE BLOCK STOP $self->{db_classify__}->execute; # %matrix maps wordids and bucket names to counts # $matrix{$wordid}{$bucket} == $count my %temp_matrix; if ( !defined( $matrix ) ) { $matrix = \%temp_matrix; } while ( my $row = $self->{db_classify__}->fetchrow_arrayref ) { $$matrix{$row->[1]}{$row->[2]} = $row->[0]; } $self->{db_classify__}->finish; foreach my $id (@id_list) { $word_count += 2; my $wmax = -10000; foreach my $bucket (@buckets) { my $probability = 0; if ( defined($$matrix{$id}{$bucket}) && ( $$matrix{$id}{$bucket} > 0 ) ) { $probability = log( $$matrix{$id}{$bucket} / $self->{db_bucketcount__}{$userid}{$bucket} ); } $matchcount{$bucket} += $self->{parser__}{words__}{$$idmap{$id}} if ($probability != 0); $probability = $self->{not_likely__}{$userid} if ( $probability == 0 ); $wmax = $probability if ( $wmax < $probability ); $score{$bucket} += ( $probability * $self->{parser__}{words__}{$$idmap{$id}} ); } if ($wmax > $self->{not_likely__}{$userid}) { $correction += $self->{not_likely__}{$userid} * $self->{parser__}{words__}{$$idmap{$id}}; } else { $correction += $wmax * $self->{parser__}{words__}{$$idmap{$id}}; } } # Now sort the scores to find the highest and return that bucket # as the classification my @ranking = sort {$score{$b} <=> $score{$a}} keys %score; my %raw_score; my $base_score = $score{$ranking[0]}; my $total = 0; # If the first and second bucket are too close in their # probabilities, call the message unclassified. Also if there are # fewer than 2 buckets. my $class = 'unclassified'; if ( @buckets > 1 && $score{$ranking[0]} > ( $score{$ranking[1]} + $self->{unclassified__} ) ) { $class = $ranking[0]; } # Compute the total of all the scores to generate the normalized # scores and probability estimate. $total is always 1 after the # first loop iteration, so any additional term less than 2 ** -54 # is insignificant, and need not be computed. my $ln2p_54 = -54 * log(2); foreach my $b (@ranking) { $raw_score{$b} = $score{$b}; $score{$b} -= $base_score; $total += exp($score{$b}) if ($score{$b} > $ln2p_54 ); } if ($self->{wordscores__} && defined($templ) ) { my %qm = %{$self->{parser__}->quickmagnets()}; my $mlen = scalar(keys %{$self->{parser__}->quickmagnets()}); if ( $mlen >= 0 ) { $templ->param( 'View_QuickMagnets_If' => 1 ); $templ->param( 'View_QuickMagnets_Count' => ($mlen + 1) ); my @buckets = $self->get_buckets( $session ); my $i = 0; my %types = $self->get_magnet_types( $session ); my @bucket_data; foreach my $bucket (@buckets) { my %row_data; $row_data{View_QuickMagnets_Bucket} = $bucket; $row_data{View_QuickMagnets_Bucket_Color} = $self->get_bucket_color( $session, $bucket ); push ( @bucket_data, \%row_data ); } my @qm_data; foreach my $type (sort keys %types) { my %row_data; if (defined $qm{$type}) { $i++; $row_data{View_QuickMagnets_Type} = $type; $row_data{View_QuickMagnets_I} = $i; $row_data{View_QuickMagnets_Loop_Buckets} = \@bucket_data; my @magnet_data; foreach my $magnet ( @{$qm{$type}} ) { my %row_magnet; $row_magnet{View_QuickMagnets_Magnet} = $magnet; push ( @magnet_data, \%row_magnet ); } $row_data{View_QuickMagnets_Loop_Magnets} = \@magnet_data; push ( @qm_data, \%row_data ); } } $templ->param( 'View_QuickMagnets_Loop' => \@qm_data ); } $templ->param( 'View_Score_If_Score' => $self->{wmformat__} eq 'score' ); my $log10 = log(10.0); my @score_data; foreach my $b (@ranking) { my %row_data; my $prob = exp($score{$b})/$total; my $probstr; my $rawstr; # If the computed probability would display as 1, display # it as .999999 instead. We don't want to give the # impression that POPFile is ever completely sure of its # classification. if ($prob >= .999999) { $probstr = sprintf("%12.6f", 0.999999); } else { if ($prob >= 0.1 || $prob == 0.0) { $probstr = sprintf("%12.6f", $prob); } else { $probstr = sprintf("%17.6e", $prob); } } my $color = $self->get_bucket_color( $session, $b ); $row_data{View_Score_Bucket} = $b; $row_data{View_Score_Bucket_Color} = $color; $row_data{View_Score_MatchCount} = $matchcount{$b}; $row_data{View_Score_ProbStr} = $probstr; if ($self->{wmformat__} eq 'score') { $row_data{View_Score_If_Score} = 1; $rawstr = sprintf("%12.6f", ($raw_score{$b} - $correction)/$log10); $row_data{View_Score_RawStr} = $rawstr; } push ( @score_data, \%row_data ); } $templ->param( 'View_Score_Loop_Scores' => \@score_data ); if ( $self->{wmformat__} ne '' ) { $templ->param( 'View_Score_If_Table' => 1 ); my @header_data; foreach my $ix (0..($#buckets > 7? 7: $#buckets)) { my %row_data; my $bucket = $ranking[$ix]; my $bucketcolor = $self->get_bucket_color( $session, $bucket ); $row_data{View_Score_Bucket} = $bucket; $row_data{View_Score_Bucket_Color} = $bucketcolor; push ( @header_data, \%row_data ); } $templ->param( 'View_Score_Loop_Bucket_Header' => \@header_data ); my %wordprobs; # If the word matrix is supposed to show probabilities, # compute them, saving the results in %wordprobs. if ( $self->{wmformat__} eq 'prob') { foreach my $id (@id_list) { my $sumfreq = 0; my %wval; foreach my $bucket (@ranking) { $wval{$bucket} = $$matrix{$id}{$bucket} || 0; $sumfreq += $wval{$bucket}; } # If $sumfreq is still zero then this word didn't # appear in any buckets so we shouldn't create # wordprobs entries for it if ( $sumfreq != 0 ) { foreach my $bucket (@ranking) { $wordprobs{$bucket,$id} = $wval{$bucket} / $sumfreq; } } } } my @ranked_ids; if ($self->{wmformat__} eq 'prob') { @ranked_ids = sort {($wordprobs{$ranking[0],$b}||0) <=> ($wordprobs{$ranking[0],$a}||0)} @id_list; } else { @ranked_ids = sort {($$matrix{$b}{$ranking[0]}||0) <=> ($$matrix{$a}{$ranking[0]}||0)} @id_list; } my @word_data; my %chart; foreach my $id (@ranked_ids) { my %row_data; my $known = 0; foreach my $bucket (@ranking) { if ( defined( $$matrix{$id}{$bucket} ) ) { $known = 1; last; } } if ( $known == 1 ) { my $wordcolor = $self->get_bucket_color( $session, $self->get_top_bucket__( $userid, $id, $matrix, \@ranking ) ); my $count = $self->{parser__}->{words__}{$$idmap{$id}}; $row_data{View_Score_Word} = $$idmap{$id}; $row_data{View_Score_Word_Color} = $wordcolor; $row_data{View_Score_Word_Count} = $count; my $base_probability = 0; if ( defined($$matrix{$id}{$ranking[0]}) && ( $$matrix{$id}{$ranking[0]} > 0 ) ) { $base_probability = log( $$matrix{$id}{$ranking[0]} / $self->{db_bucketcount__}{$userid}{$ranking[0]} ); } my @per_bucket; my @score; foreach my $ix (0..($#buckets > 7? 7: $#buckets)) { my %bucket_row; my $bucket = $ranking[$ix]; my $probability = 0; if ( defined($$matrix{$id}{$bucket}) && ( $$matrix{$id}{$bucket} > 0 ) ) { $probability = log( $$matrix{$id}{$bucket} / $self->{db_bucketcount__}{$userid}{$bucket} ); } my $color = 'black'; if ( $probability >= $base_probability || $base_probability == 0 ) { $color = $self->get_bucket_color( $session, $bucket ); } $bucket_row{View_Score_If_Probability} = ( $probability != 0 ); $bucket_row{View_Score_Word_Color} = $color; if ( $probability != 0 ) { my $wordprobstr; if ($self->{wmformat__} eq 'score') { $wordprobstr = sprintf("%12.4f", ($probability - $self->{not_likely__}{$userid})/$log10 ); push ( @score, $wordprobstr ); } else { if ($self->{wmformat__} eq 'prob') { $wordprobstr = sprintf("%12.4f", $wordprobs{$bucket,$id}); } else { $wordprobstr = sprintf("%13.5f", exp($probability) ); } } $bucket_row{View_Score_Probability} = $wordprobstr; } else { # Scores eq 0 must also be remembered. push @score, 0; } push ( @per_bucket, \%bucket_row ); } $row_data{View_Score_Loop_Per_Bucket} = \@per_bucket; # If we are doing the word scores then we build up # a hash that maps the name of a word to a value # which is the difference between the word scores # for the top two buckets. We later use this to # draw a chart if ( $self->{wmformat__} eq 'score' ) { $chart{$$idmap{$id}} = ( $score[0] || 0 ) - ( $score[1] || 0 ); } push ( @word_data, \%row_data ); } } $templ->param( 'View_Score_Loop_Words' => \@word_data ); if ( $self->{wmformat__} eq 'score' ) { # Draw a chart that shows how the decision between the top # two buckets was made. my @words = sort { $chart{$b} <=> $chart{$a} } keys %chart; my @chart_data; my $max_chart = $chart{$words[0]}; my $min_chart = $chart{$words[$#words]}; my $scale = ( $max_chart > $min_chart ) ? 400 / ( $max_chart - $min_chart ) : 0; my $color_1 = $self->get_bucket_color( $session, $ranking[0] ); my $color_2 = $self->get_bucket_color( $session, $ranking[1] ); $templ->param( 'Bucket_1' => $ranking[0] ); $templ->param( 'Bucket_2' => $ranking[1] ); $templ->param( 'Color_Bucket_1' => $color_1 ); $templ->param( 'Color_Bucket_2' => $color_2 ); $templ->param( 'Score_Bucket_1' => sprintf("%.3f", ($raw_score{$ranking[0]} - $correction)/$log10) ); $templ->param( 'Score_Bucket_2' => sprintf("%.3f", ($raw_score{$ranking[1]} - $correction)/$log10) ); for ( my $i=0; $i <= $#words; $i++ ) { my $word_1 = $words[$i]; my $word_2 = $words[$#words - $i]; my $width_1 = int( $chart{$word_1} * $scale + .5 ); my $width_2 = int( $chart{$word_2} * $scale - .5 ) * -1; last if ( $width_1 <=0 && $width_2 <= 0 ); my %row_data; $row_data{View_Chart_Word_1} = $word_1; if ( $width_1 > 0 ) { $row_data{View_If_Bar_1} = 1; $row_data{View_Width_1} = $width_1; $row_data{View_Color_1} = $color_1; $row_data{Score_Word_1} = sprintf "%.3f", $chart{$word_1}; } else { $row_data{View_If_Bar_1} = 0; } $row_data{View_Chart_Word_2} = $word_2; if ( $width_2 > 0 ) { $row_data{View_If_Bar_2} = 1; $row_data{View_Width_2} = $width_2; $row_data{View_Color_2} = $color_2; $row_data{Score_Word_2} = sprintf "%.3f", $chart{$word_2}; } else { $row_data{View_If_Bar_2} = 0; } push ( @chart_data, \%row_data ); } $templ->param( 'View_Loop_Chart' => \@chart_data ); $templ->param( 'If_chart' => 1 ); } else { $templ->param( 'If_chart' => 0 ); } } } return $class; } #---------------------------------------------------------------------------- # # classify_and_modify # # This method reads an email terminated by . on a line by itself (or # the end of stream) from a handle and creates an entry in the # history, outputting the same email on another handle with the # appropriate header modifications and insertions # # $session - A valid session key returned by a call to get_session_key # $mail - an open stream to read the email from # $client - an open stream to write the modified email to # $nosave - set to 1 indicates that this should not save to history # $class - if we already know the classification # $slot - Must be defined if $class is set # $echo - 1 to echo to the client, 0 to supress, defaults to 1 # $crlf - The sequence to use at the end of a line in the output, # normally this is left undefined and this method uses $eol (the # normal network end of line), but if this method is being used with # real files you may wish to pass in \n instead # # Returns a classification if it worked and the slot ID of the history # item related to this classification # # IMPORTANT NOTE: $mail and $client should be binmode # #---------------------------------------------------------------------------- sub classify_and_modify { my ( $self, $session, $mail, $client, $nosave, $class, $slot, $echo, $crlf ) = @_; $echo = 1 unless (defined $echo); $crlf = $eol unless (defined $crlf); my $msg_subject; # The message subject my $msg_head_before = ''; # Store the message headers that # come before Subject here my $msg_head_after = ''; # Store the message headers that # come after Subject here my $msg_head_q = ''; # Store questionable header lines here my $msg_body = ''; # Store the message body here my $in_subject_header = 0; # 1 if in Subject header # These two variables are used to control the insertion of the # X-POPFile-TimeoutPrevention header when downloading long or slow # emails my $last_timeout = time; my $timeout_count = 0; # Indicates whether the first time through the receive loop we got # the full body, this will happen on small emails my $got_full_body = 0; # The size of the message downloaded so far. my $message_size = 0; # The classification for this message my $classification = ''; # Whether we are currently reading the mail headers or not my $getting_headers = 1; my $msg_file; # If we don't yet know the classification then start the parser $class = '' if ( !defined( $class ) ); if ( $class eq '' ) { $self->{parser__}->start_parse(); ( $slot, $msg_file ) = $self->{history__}->reserve_slot(); } else { $msg_file = $self->{history__}->get_slot_file( $slot ); } # We append .TMP to the filename for the MSG file so that if we are in # middle of downloading a message and we refresh the history we do not # get class file errors open MSG, ">$msg_file" unless $nosave; while ( my $line = $self->slurp_( $mail ) ) { my $fileline; # This is done so that we remove the network style end of line # CR LF and allow Perl to decide on the local system EOL which # it will expand out of \n when this gets written to the temp # file $fileline = $line; $fileline =~ s/[\r\n]//g; $fileline .= "\n"; # Check for an abort last if ( $self->{alive_} == 0 ); # The termination of a message is a line consisting of exactly # .CRLF so we detect that here exactly if ( $line =~ /^\.(\r\n|\r|\n)$/ ) { $got_full_body = 1; last; } if ( $getting_headers ) { # Kill header lines containing only whitespace (Exim does this) next if ( $line =~ /^[ \t]+(\r\n|\r|\n)$/i ); if ( !( $line =~ /^(\r\n|\r|\n)$/i ) ) { $message_size += length $line; $self->write_line__( $nosave?undef:\*MSG, $fileline, $class ); # If there is no echoing occuring, it doesn't matter # what we do to these if ( $echo ) { if ( $line =~ /^Subject:(.*)/i ) { $msg_subject = $1; $msg_subject =~ s/(\012|\015)//g; $in_subject_header = 1; next; } elsif ( $line !~ /^[ \t]/ ) { $in_subject_header = 0; } # Strip out the X-Text-Classification header that # is in an incoming message next if ( $line =~ /^X-Text-Classification:/i ); next if ( $line =~ /^X-POPFile-Link:/i ); # Store any lines that appear as though they may # be non-header content Lines that are headers # begin with whitespace or Alphanumerics and "-" # followed by a colon. # # This prevents weird things like HTML before the # headers terminate from causing the XPL and XTC # headers to be inserted in places some clients # can't detect if ( ( $line =~ /^[ \t]/ ) && $in_subject_header ) { $line =~ s/(\012|\015)//g; $msg_subject .= $crlf . $line; next; } if ( $line =~ /^([ \t]|([A-Z0-9\-_]+:))/i ) { if ( !defined($msg_subject) ) { $msg_head_before .= $msg_head_q . $line; } else { $msg_head_after .= $msg_head_q . $line; } $msg_head_q = ''; } else { # Gather up any header lines that are questionable $self->log_( 1, "Found odd email header: $line" ); $msg_head_q .= $line; } } } else { $self->write_line__( $nosave?undef:\*MSG, "\n", $class ); $message_size += length $crlf; $getting_headers = 0; } } else { $message_size += length $line; $msg_body .= $line; $self->write_line__( $nosave?undef:\*MSG, $fileline, $class ); } # Check to see if too much time has passed and we need to keep # the mail client happy if ( time > ( $last_timeout + 2 ) ) { print $client "X-POPFile-TimeoutPrevention: $timeout_count$crlf" if ( $echo ); $timeout_count += 1; $last_timeout = time; } last if ( ( $message_size > $self->global_config_( 'message_cutoff' ) ) && ( $getting_headers == 0 ) ); } close MSG unless $nosave; # If we don't yet know the classification then stop the parser if ( $class eq '' ) { $self->{parser__}->stop_parse(); } # Do the text classification and update the counter for that # bucket that we just downloaded an email of that type $classification = ($class ne '')?$class:$self->classify( $session, undef); my $subject_modification = $self->get_bucket_parameter( $session, $classification, 'subject' ); my $xtc_insertion = $self->get_bucket_parameter( $session, $classification, 'xtc' ); my $xpl_insertion = $self->get_bucket_parameter( $session, $classification, 'xpl' ); my $quarantine = $self->get_bucket_parameter( $session, $classification, 'quarantine' ); my $modification = $self->config_( 'subject_mod_left' ) . $classification . $self->config_( 'subject_mod_right' ); # Add the Subject line modification or the original line back again # Don't add the classification unless it is not present if ( ( defined( $msg_subject ) && ( $msg_subject !~ /\Q$modification\E/ ) ) && # PROFILE BLOCK START ( $subject_modification == 1 ) && ( $quarantine == 0 ) ) { # PROFILE BLOCK STOP $msg_subject = " $modification$msg_subject"; } if ( !defined( $msg_subject ) && # PROFILE BLOCK START ( $subject_modification == 1 ) && ( $quarantine == 0 ) ) { # PROFILE BLOCK STOP $msg_subject = " $modification"; } $msg_subject = '' if ( !defined( $msg_subject ) ); $msg_head_before .= 'Subject:' . $msg_subject; $msg_head_before .= $crlf; # Add the XTC header $msg_head_after .= "X-Text-Classification: $classification$crlf" if ( ( $xtc_insertion ) && # PROFILE BLOCK START ( $quarantine == 0 ) ); # PROFILE BLOCK STOP # Add the XPL header my $xpl = $self->config_( 'xpl_angle' )?'<':''; $xpl .= "http://"; $xpl .= $self->module_config_( 'html', 'local' )?"127.0.0.1":$self->config_( 'hostname' ); $xpl .= ":" . $self->module_config_( 'html', 'port' ) . "/jump_to_message?view=$slot"; if ( $self->config_( 'xpl_angle' ) ) { $xpl .= '>'; } $xpl .= "$crlf"; if ( $xpl_insertion && ( $quarantine == 0 ) ) { $msg_head_after .= 'X-POPFile-Link: ' . $xpl; } $msg_head_after .= $msg_head_q . "$crlf"; # Echo the text of the message to the client if ( $echo ) { # If the bucket is quarantined then we'll treat it specially # by changing the message header to contain information from # POPFile and wrapping the original message in a MIME encoding if ( $quarantine == 1 ) { my ( $orig_from, $orig_to, $orig_subject ) = ( $self->{parser__}->get_header('from'), $self->{parser__}->get_header('to'), $self->{parser__}->get_header('subject') ); my ( $encoded_from, $encoded_to ) = ( $orig_from, $orig_to ); if ( $self->{parser__}->{lang__} eq 'Nihongo' ) { require Encode; Encode::from_to( $orig_from, 'euc-jp', 'iso-2022-jp'); Encode::from_to( $orig_to, 'euc-jp', 'iso-2022-jp'); Encode::from_to( $orig_subject, 'euc-jp', 'iso-2022-jp'); $encoded_from = $orig_from; $encoded_to = $orig_to; $encoded_from =~ s/(\x1B\x24\x42.+\x1B\x28\x42)/"=?ISO-2022-JP?B?" . encode_base64($1,'') . "?="/eg; $encoded_to =~ s/(\x1B\x24\x42.+\x1B\x28\x42)/"=?ISO-2022-JP?B?" . encode_base64($1,'') . "?="/eg; } print $client "From: $encoded_from$crlf"; print $client "To: $encoded_to$crlf"; print $client "Date: " . $self->{parser__}->get_header( 'date' ) . "$crlf"; # Don't add the classification unless it is not present if ( ( defined( $msg_subject ) && ( $msg_subject !~ /\[\Q$classification\E\]/ ) ) && # PROFILE BLOCK START ( $subject_modification == 1 ) ) { # PROFILE BLOCK STOP $msg_subject = " $modification$msg_subject"; } print $client "Subject:$msg_subject$crlf"; print $client "X-Text-Classification: $classification$crlf" if ( $xtc_insertion ); print $client 'X-POPFile-Link: ' . $xpl if ( $xpl_insertion ); print $client "MIME-Version: 1.0$crlf"; print $client "Content-Type: multipart/report; boundary=\"$slot\"$crlf$crlf--$slot$crlf"; print $client "Content-Type: text/plain"; print $client "; charset=iso-2022-jp" if ( $self->{parser__}->{lang__} eq 'Nihongo' ); print $client "$crlf$crlf"; print $client "POPFile has quarantined a message. It is attached to this email.$crlf$crlf"; print $client "Quarantined Message Detail$crlf$crlf"; print $client "Original From: $orig_from$crlf"; print $client "Original To: $orig_to$crlf"; print $client "Original Subject: $orig_subject$crlf"; print $client "To examine the email open the attachment. "; print $client "To change this mail's classification go to $xpl"; print $client "$crlf"; print $client "The first 20 words found in the email are:$crlf$crlf"; my $first20 = $self->{parser__}->first20(); if ( $self->{parser__}->{lang__} eq 'Nihongo' ) { require Encode; Encode::from_to( $first20, 'euc-jp', 'iso-2022-jp'); } print $client $first20; print $client "$crlf--$slot$crlf"; print $client "Content-Type: message/rfc822$crlf$crlf"; } print $client $msg_head_before; print $client $msg_head_after; print $client $msg_body; } my $before_dot = ''; if ( $quarantine && $echo ) { $before_dot = "$crlf--$slot--$crlf"; } my $need_dot = 0; if ( $got_full_body ) { $need_dot = 1; } else { $need_dot = !$self->echo_to_dot_( $mail, $echo?$client:undef, $nosave?undef:'>>' . $msg_file, $before_dot ) && !$nosave; } if ( $need_dot ) { print $client $before_dot if ( $before_dot ne '' ); print $client ".$crlf" if ( $echo ); } # In some cases it's possible (and totally illegal) to get a . in # the middle of the message, to cope with the we call flush_extra_ # here to remove any extra stuff the POP3 server is sending Make # sure to supress output if we are not echoing, and to save to # file if not echoing and saving if ( !($nosave || $echo) ) { # if we're saving (not nosave) and not echoing, we can safely # unload this into the temp file if (open FLUSH, ">$msg_file.flush") { binmode FLUSH; # TODO: Do this in a faster way (without flushing to one # file then copying to another) (perhaps a select on $mail # to predict if there is flushable data) $self->flush_extra_( $mail, \*FLUSH, 0); close FLUSH; # append any data we got to the actual temp file if ( ( (-s "$msg_file.flush") > 0 ) && ( open FLUSH, "<$msg_file.flush" ) ) { binmode FLUSH; if ( open TEMP, ">>$msg_file" ) { binmode TEMP; # The only time we get data here is if it is after # a CRLF.CRLF We have to re-create it to avoid # data-loss print TEMP ".$crlf"; print TEMP $_ while (<FLUSH>); # NOTE: The last line flushed MAY be a CRLF.CRLF, # which isn't actually part of the message body close TEMP; } close FLUSH; } unlink("$msg_file.flush"); } } else { # if we are echoing, the client can make sure we have no data # loss otherwise, the data can be discarded (not saved and not # echoed) $self->flush_extra_( $mail, $client, $echo?0:1); } if ( $class eq '' ) { if ( $nosave ) { $self->{history__}->release_slot( $slot ); } else { $self->{history__}->commit_slot( $session, $slot, $classification, $self->{magnet_detail__} ); } } return ( $classification, $slot, $self->{magnet_used__} ); } #---------------------------------------------------------------------------- # # get_buckets # # Returns a list containing all the real bucket names sorted into # alphabetic order # # $session A valid session key returned by a call to get_session_key # #---------------------------------------------------------------------------- sub get_buckets { my ( $self, $session ) = @_; my $userid = $self->valid_session_key__( $session ); return undef if ( !defined( $userid ) ); # Note that get_buckets does not return pseudo buckets my @buckets; for my $b (sort keys %{$self->{db_bucketid__}{$userid}}) { if ( $self->{db_bucketid__}{$userid}{$b}{pseudo} == 0 ) { push @buckets, ($b); } } return @buckets; } #---------------------------------------------------------------------------- # # get_bucket_id # # Returns the internal ID for a bucket for database calls # # $session A valid session key returned by a call to get_session_key # $bucket The bucket name # #---------------------------------------------------------------------------- sub get_bucket_id { my ( $self, $session, $bucket ) = @_; my $userid = $self->valid_session_key__( $session ); return undef if ( !defined( $userid ) ); return $self->{db_bucketid__}{$userid}{$bucket}{id}; } #---------------------------------------------------------------------------- # # get_bucket_name # # Returns the name of a bucket from an internal ID # # $session A valid session key returned by a call to get_session_key # $id The bucket id # #---------------------------------------------------------------------------- sub get_bucket_name { my ( $self, $session, $id ) = @_; my $userid = $self->valid_session_key__( $session ); return undef if ( !defined( $userid ) ); foreach $b (keys %{$self->{db_bucketid__}{$userid}}) { if ( $id == $self->{db_bucketid__}{$userid}{$b}{id} ) { return $b; } } return ''; } #---------------------------------------------------------------------------- # # get_pseudo_buckets # # Returns a list containing all the pseudo bucket names sorted into # alphabetic order # # $session A valid session key returned by a call to get_session_key # #---------------------------------------------------------------------------- sub get_pseudo_buckets { my ( $self, $session ) = @_; my $userid = $self->valid_session_key__( $session ); return undef if ( !defined( $userid ) ); my @buckets; for my $b (sort keys %{$self->{db_bucketid__}{$userid}}) { if ( $self->{db_bucketid__}{$userid}{$b}{pseudo} == 1 ) { push @buckets, ($b); } } return @buckets; } #---------------------------------------------------------------------------- # # get_all_buckets # # Returns a list containing all the bucket names sorted into # alphabetic order # # $session A valid session key returned by a call to get_session_key # #---------------------------------------------------------------------------- sub get_all_buckets { my ( $self, $session ) = @_; my $userid = $self->valid_session_key__( $session ); return undef if ( !defined( $userid ) ); my @buckets; for my $b (sort keys %{$self->{db_bucketid__}{$userid}}) { push @buckets, ($b); } return @buckets; } #---------------------------------------------------------------------------- # # is_pseudo_bucket # # Returns 1 if the named bucket is pseudo # # $session A valid session key returned by a call to get_session_key # $bucket The bucket to check # #---------------------------------------------------------------------------- sub is_pseudo_bucket { my ( $self, $session, $bucket ) = @_; my $userid = $self->valid_session_key__( $session ); return undef if ( !defined( $userid ) ); return ( defined($self->{db_bucketid__}{$userid}{$bucket}) # PROFILE BLOCK START && $self->{db_bucketid__}{$userid}{$bucket}{pseudo} ); # PROFILE BLOCK STOP } #---------------------------------------------------------------------------- # # is_bucket # # Returns 1 if the named bucket is a bucket # # $session A valid session key returned by a call to get_session_key # $bucket The bucket to check # #---------------------------------------------------------------------------- sub is_bucket { my ( $self, $session, $bucket ) = @_; my $userid = $self->valid_session_key__( $session ); return undef if ( !defined( $userid ) ); return ( ( defined( $self->{db_bucketid__}{$userid}{$bucket} ) ) && # PROFILE BLOCK START ( !$self->{db_bucketid__}{$userid}{$bucket}{pseudo} ) ); # PROFILE BLOCK STOP } #---------------------------------------------------------------------------- # # get_bucket_word_count # # Returns the total word count (including duplicates) for the passed in bucket # # $session A valid session key returned by a call to get_session_key # $bucket The name of the bucket for which the word count is desired # #---------------------------------------------------------------------------- sub get_bucket_word_count { my ( $self, $session, $bucket ) = @_; my $userid = $self->valid_session_key__( $session ); return undef if ( !defined( $userid ) ); my $c = $self->{db_bucketcount__}{$userid}{$bucket}; return defined($c)?$c:0; } #---------------------------------------------------------------------------- # # get_bucket_word_list # # Returns a list of words all with the same first character # # $session A valid session key returned by a call to get_session_key # $bucket The name of the bucket for which the word count is desired # $prefix The first character of the words # #---------------------------------------------------------------------------- sub get_bucket_word_list { my ( $self, $session, $bucket, $prefix ) = @_; my $userid = $self->valid_session_key__( $session ); return undef if ( !defined( $userid ) ); my $bucketid = $self->{db_bucketid__}{$userid}{$bucket}{id}; my $result = $self->{db__}->selectcol_arrayref( # PROFILE BLOCK START "select words.word from matrix, words where matrix.wordid = words.id and matrix.bucketid = $bucketid and words.word like '$prefix%';"); # PROFILE BLOCK STOP return @{$result}; } #---------------------------------------------------------------------------- # # get_bucket_word_prefixes # # Returns a list of all the initial letters of words in a bucket # # $session A valid session key returned by a call to get_session_key # $bucket The name of the bucket for which the word count is desired # #---------------------------------------------------------------------------- sub get_bucket_word_prefixes { my ( $self, $session, $bucket ) = @_; my $userid = $self->valid_session_key__( $session ); return undef if ( !defined( $userid ) ); my $prev = ''; my $bucketid = $self->{db_bucketid__}{$userid}{$bucket}{id}; my $result = $self->{db__}->selectcol_arrayref( # PROFILE BLOCK START "select words.word from matrix, words where matrix.wordid = words.id and matrix.bucketid = $bucketid;"); # PROFILE BLOCK STOP # In Japanese mode, disable locale and use substr_euc, the substr # function which supports EUC Japanese charset. Sorting Japanese # with "use locale" is memory and time consuming, and may cause # perl crash. if ( $self->module_config_( 'html', 'language' ) eq 'Nihongo' ) { return grep {$_ ne $prev && ($prev = $_, 1)} sort map {substr_euc__($_,0,1)} @{$result}; } else { if ( $self->module_config_( 'html', 'language' ) eq 'Korean' ) { return grep {$_ ne $prev && ($prev = $_, 1)} sort map {$_ =~ /([\x20-\x80]|$eksc)/} @{$result}; } else { return grep {$_ ne $prev && ($prev = $_, 1)} sort map {substr($_,0,1)} @{$result}; } } } #---------------------------------------------------------------------------- # # get_word_count # # Returns the total word count (including duplicates) # # $session A valid session key returned by a call to get_session_key # #---------------------------------------------------------------------------- sub get_word_count { my ( $self, $session ) = @_; my $userid = $self->valid_session_key__( $session ); return undef if ( !defined( $userid ) ); $self->{db_get_full_total__}->execute( $userid ); return $self->{db_get_full_total__}->fetchrow_arrayref->[0]; } #---------------------------------------------------------------------------- # # get_count_for_word # # Returns the number of times the word occurs in a bucket # # $session A valid session key returned by a call to get_session_key # $bucket The bucket we are asking about # $word The word we are asking about # #---------------------------------------------------------------------------- sub get_count_for_word { my ( $self, $session, $bucket, $word ) = @_; my $userid = $self->valid_session_key__( $session ); return undef if ( !defined( $userid ) ); return $self->get_base_value_( $session, $bucket, $word ); } #---------------------------------------------------------------------------- # # get_bucket_unique_count # # Returns the unique word count (excluding duplicates) for the passed # in bucket # # $session A valid session key returned by a call to get_session_key # $bucket The name of the bucket for which the word count is desired # #---------------------------------------------------------------------------- sub get_bucket_unique_count { my ( $self, $session, $bucket ) = @_; my $userid = $self->valid_session_key__( $session ); return undef if ( !defined( $userid ) ); my $c = $self->{db_bucketunique__}{$userid}{$bucket}; return defined($c)?$c:0; } #---------------------------------------------------------------------------- # # get_unique_word_count # # Returns the unique word count (excluding duplicates) for all buckets # # $session A valid session key returned by a call to get_session_key # #---------------------------------------------------------------------------- sub get_unique_word_count { my ( $self, $session ) = @_; my $userid = $self->valid_session_key__( $session ); return undef if ( !defined( $userid ) ); $self->{db_get_unique_word_count__}->execute( $userid ); return $self->{db_get_unique_word_count__}->fetchrow_arrayref->[0]; } #---------------------------------------------------------------------------- # # get_bucket_color # # Returns the color associated with a bucket # # $session A valid session key returned by a call to get_session_key # $bucket The name of the bucket for which the color is requested # # NOTE This API is DEPRECATED in favor of calling get_bucket_parameter for # the parameter named 'color' #---------------------------------------------------------------------------- sub get_bucket_color { my ( $self, $session, $bucket ) = @_; return $self->get_bucket_parameter( $session, $bucket, 'color' ); } #---------------------------------------------------------------------------- # # set_bucket_color # # Returns the color associated with a bucket # # $session A valid session key returned by a call to get_session_key # $bucket The name of the bucket for which the color is requested # $color The new color # # NOTE This API is DEPRECATED in favor of calling set_bucket_parameter for # the parameter named 'color' #---------------------------------------------------------------------------- sub set_bucket_color { my ( $self, $session, $bucket, $color ) = @_; return $self->set_bucket_parameter( $session, $bucket, 'color', $color ); } #---------------------------------------------------------------------------- # # get_bucket_parameter # # Returns the value of a per bucket parameter # # $session A valid session key returned by a call to get_session_key # $bucket The name of the bucket # $parameter The name of the parameter # #---------------------------------------------------------------------------- sub get_bucket_parameter { my ( $self, $session, $bucket, $parameter ) = @_; my $userid = $self->valid_session_key__( $session ); return undef if ( !defined( $userid ) ); # See if there's a cached value if ( defined( $self->{db_parameters__}{$userid}{$bucket}{$parameter} ) ) { return $self->{db_parameters__}{$userid}{$bucket}{$parameter}; } # Make sure that the bucket passed in actually exists if ( !defined( $self->{db_bucketid__}{$userid}{$bucket} ) ) { return undef; } # If there is a non-default value for this parameter then return it. $self->{db_get_bucket_parameter__}->execute( $self->{db_bucketid__}{$userid}{$bucket}{id}, $self->{db_parameterid__}{$parameter} ); my $result = $self->{db_get_bucket_parameter__}->fetchrow_arrayref; # If this parameter has not been defined for this specific bucket then # get the default value if ( !defined( $result ) ) { $self->{db_get_bucket_parameter_default__}->execute( # PROFILE BLOCK START $self->{db_parameterid__}{$parameter} ); # PROFILE BLOCK STOP $result = $self->{db_get_bucket_parameter_default__}->fetchrow_arrayref; } if ( defined( $result ) ) { $self->{db_parameters__}{$userid}{$bucket}{$parameter} = $result->[0]; return $result->[0]; } else { return undef; } } #---------------------------------------------------------------------------- # # set_bucket_parameter # # Sets the value associated with a bucket specific parameter # # $session A valid session key returned by a call to get_session_key # $bucket The name of the bucket # $parameter The name of the parameter # $value The new value # #---------------------------------------------------------------------------- sub set_bucket_parameter { my ( $self, $session, $bucket, $parameter, $value ) = @_; my $userid = $self->valid_session_key__( $session ); return undef if ( !defined( $userid ) ); # Make sure that the bucket passed in actually exists if ( !defined( $self->{db_bucketid__}{$userid}{$bucket} ) ) { return undef; } my $bucketid = $self->{db_bucketid__}{$userid}{$bucket}{id}; my $btid = $self->{db_parameterid__}{$parameter}; # Exactly one row should be affected by this statement $self->{db_set_bucket_parameter__}->execute( $bucketid, $btid, $value ); if ( defined( $self->{db_parameters__}{$userid}{$bucket}{$parameter} ) ) { $self->{db_parameters__}{$userid}{$bucket}{$parameter} = $value; } return 1; } #---------------------------------------------------------------------------- # # get_html_colored_message # # Parser a mail message stored in a file and returns HTML representing # the message with coloring of the words # # $session A valid session key returned by a call to get_session_key # $file The file to parse # #---------------------------------------------------------------------------- sub get_html_colored_message { my ( $self, $session, $file ) = @_; my $userid = $self->valid_session_key__( $session ); return undef if ( !defined( $userid ) ); $self->{parser__}->{color__} = $session; $self->{parser__}->{color_matrix__} = undef; $self->{parser__}->{color_idmap__} = undef; $self->{parser__}->{color_userid__} = undef; $self->{parser__}->{bayes__} = bless $self; my $result = $self->{parser__}->parse_file( $file, # PROFILE BLOCK START $self->global_config_( 'message_cutoff' ) ); # PROFILE BLOCK STOP $self->{parser__}->{color__} = ''; return $result; } #---------------------------------------------------------------------------- # # fast_get_html_colored_message # # Parser a mail message stored in a file and returns HTML representing the message # with coloring of the words # # $session A valid session key returned by a call to get_session_key # $file The file to colorize # $matrix Reference to the matrix hash from a call to classify # $idmap Reference to the idmap hash from a call to classify # #---------------------------------------------------------------------------- sub fast_get_html_colored_message { my ( $self, $session, $file, $matrix, $idmap ) = @_; my $userid = $self->valid_session_key__( $session ); return undef if ( !defined( $userid ) ); $self->{parser__}->{color__} = $session; $self->{parser__}->{color_matrix__} = $matrix; $self->{parser__}->{color_idmap__} = $idmap; $self->{parser__}->{color_userid__} = $userid; $self->{parser__}->{bayes__} = bless $self; my $result = $self->{parser__}->parse_file( $file, $self->global_config_( 'message_cutoff' ) ); $self->{parser__}->{color__} = ''; return $result; } #---------------------------------------------------------------------------- # # create_bucket # # Creates a new bucket, returns 1 if the creation succeeded # # $session A valid session key returned by a call to get_session_key # $bucket Name for the new bucket # #---------------------------------------------------------------------------- sub create_bucket { my ( $self, $session, $bucket ) = @_; if ( $self->is_bucket( $session, $bucket ) || # PROFILE BLOCK START $self->is_pseudo_bucket( $session, $bucket ) ) { # PROFILE BLOCK STOP return 0; } my $userid = $self->valid_session_key__( $session ); return undef if ( !defined( $userid ) ); $bucket = $self->{db__}->quote( $bucket ); $self->{db__}->do( # PROFILE BLOCK START "insert into buckets ( name, pseudo, userid ) values ( $bucket, 0, $userid );" ); # PROFILE BLOCK STOP $self->db_update_cache__( $session ); return 1; } #---------------------------------------------------------------------------- # # delete_bucket # # Deletes a bucket, returns 1 if the delete succeeded # # $session A valid session key returned by a call to get_session_key # $bucket Name of the bucket to delete # #---------------------------------------------------------------------------- sub delete_bucket { my ( $self, $session, $bucket ) = @_; my $userid = $self->valid_session_key__( $session ); return undef if ( !defined( $userid ) ); # Make sure that the bucket passed in actually exists if ( !defined( $self->{db_bucketid__}{$userid}{$bucket} ) ) { return 0; } $self->{db__}->do( # PROFILE BLOCK START "delete from buckets where buckets.userid = $userid and buckets.name = '$bucket';" ); # PROFILE BLOCK STOP $self->db_update_cache__( $session ); return 1; } #---------------------------------------------------------------------------- # # rename_bucket # # Renames a bucket, returns 1 if the rename succeeded # # $session A valid session key returned by a call to get_session_key # $old_bucket The old name of the bucket # $new_bucket The new name of the bucket # #---------------------------------------------------------------------------- sub rename_bucket { my ( $self, $session, $old_bucket, $new_bucket ) = @_; my $userid = $self->valid_session_key__( $session ); return undef if ( !defined( $userid ) ); # Make sure that the bucket passed in actually exists if ( !defined( $self->{db_bucketid__}{$userid}{$old_bucket} ) ) { $self->log_( 0, "Bad bucket name $old_bucket to rename_bucket" ); return 0; } my $id = $self->{db__}->quote( $self->{db_bucketid__}{$userid}{$old_bucket}{id} ); $new_bucket = $self->{db__}->quote( $new_bucket ); $self->log_( 1, "Rename bucket $old_bucket to $new_bucket" ); my $result = $self->{db__}->do( "update buckets set name = $new_bucket where id = $id;" ); if ( !defined( $result ) || ( $result == -1 ) ) { return 0; } else { $self->db_update_cache__( $session ); return 1; } } #---------------------------------------------------------------------------- # # add_messages_to_bucket # # Parses mail messages and updates the statistics in the specified bucket # # $session A valid session key returned by a call to get_session_key # $bucket Name of the bucket to be updated # @files List of file names to parse # #---------------------------------------------------------------------------- sub add_messages_to_bucket { my ( $self, $session, $bucket, @files ) = @_; my $userid = $self->valid_session_key__( $session ); return undef if ( !defined( $userid ) ); if ( !defined( $self->{db_bucketid__}{$userid}{$bucket}{id} ) ) { return 0; } # This is done to clear out the word list because in the loop # below we are going to not reset the word list on each parse $self->{parser__}->start_parse(); $self->{parser__}->stop_parse(); foreach my $file (@files) { $self->{parser__}->parse_file( $file, # PROFILE BLOCK START $self->global_config_( 'message_cutoff' ), 0 ); # PROFILE BLOCK STOP (Do not reset word list) } $self->add_words_to_bucket__( $session, $bucket, 1 ); $self->db_update_cache__( $session ); return 1; } #---------------------------------------------------------------------------- # # add_message_to_bucket # # Parses a mail message and updates the statistics in the specified bucket # # $session A valid session key returned by a call to get_session_key # $bucket Name of the bucket to be updated # $file Name of file containing mail message to parse # #---------------------------------------------------------------------------- sub add_message_to_bucket { my ( $self, $session, $bucket, $file ) = @_; my $userid = $self->valid_session_key__( $session ); return undef if ( !defined( $userid ) ); return $self->add_messages_to_bucket( $session, $bucket, $file ); } #---------------------------------------------------------------------------- # # remove_message_from_bucket # # Parses a mail message and updates the statistics in the specified bucket # # $session A valid session key returned by a call to get_session_key # $bucket Name of the bucket to be updated # $file Name of file containing mail message to parse # #---------------------------------------------------------------------------- sub remove_message_from_bucket { my ( $self, $session, $bucket, $file ) = @_; my $userid = $self->valid_session_key__( $session ); return undef if ( !defined( $userid ) ); $self->{parser__}->parse_file( $file, # PROFILE BLOCK START $self->global_config_( 'message_cutoff' ) ); # PROFILE BLOCK STOP $self->add_words_to_bucket__( $session, $bucket, -1 ); $self->db_update_cache__( $session ); return 1; } #---------------------------------------------------------------------------- # # get_buckets_with_magnets # # Returns the names of the buckets for which magnets are defined # # $session A valid session key returned by a call to get_session_key # #---------------------------------------------------------------------------- sub get_buckets_with_magnets { my ( $self, $session ) = @_; my $userid = $self->valid_session_key__( $session ); return undef if ( !defined( $userid ) ); my @result; $self->{db_get_buckets_with_magnets__}->execute( $userid ); while ( my $row = $self->{db_get_buckets_with_magnets__}->fetchrow_arrayref ) { push @result, ($row->[0]); } return @result; } #---------------------------------------------------------------------------- # # get_magnet_types_in_bucket # # Returns the types of the magnets in a specific bucket # # $session A valid session key returned by a call to get_session_key # $bucket The bucket to search for magnets # #---------------------------------------------------------------------------- sub get_magnet_types_in_bucket { my ( $self, $session, $bucket ) = @_; my $userid = $self->valid_session_key__( $session ); return undef if ( !defined( $userid ) ); my @result; my $bucketid = $self->{db_bucketid__}{$userid}{$bucket}{id}; my $h = $self->{db__}->prepare( "select magnet_types.mtype from magnet_types, magnets, buckets where magnet_types.id = magnets.mtid and magnets.bucketid = buckets.id and buckets.id = $bucketid group by magnet_types.mtype order by magnet_types.mtype;" ); $h->execute; while ( my $row = $h->fetchrow_arrayref ) { push @result, ($row->[0]); } $h->finish; return @result; } #---------------------------------------------------------------------------- # # clear_bucket # # Removes all words from a bucket # # $session A valid session key returned by a call to get_session_key # $bucket The bucket to clear # #---------------------------------------------------------------------------- sub clear_bucket { my ( $self, $session, $bucket ) = @_; my $userid = $self->valid_session_key__( $session ); return undef if ( !defined( $userid ) ); my $bucketid = $self->{db_bucketid__}{$userid}{$bucket}{id}; $self->{db__}->do( "delete from matrix where matrix.bucketid = $bucketid;" ); $self->db_update_cache__( $session ); } #---------------------------------------------------------------------------- # # clear_magnets # # Removes every magnet currently defined # # $session A valid session key returned by a call to get_session_key # #---------------------------------------------------------------------------- sub clear_magnets { my ( $self, $session ) = @_; my $userid = $self->valid_session_key__( $session ); return undef if ( !defined( $userid ) ); for my $bucket (keys %{$self->{db_bucketid__}{$userid}}) { my $bucketid = $self->{db_bucketid__}{$userid}{$bucket}{id}; $self->{db__}->do( "delete from magnets where magnets.bucketid = $bucketid" ); } } #---------------------------------------------------------------------------- # # get_magnets # # Returns the magnets of a certain type in a bucket # # $session A valid session key returned by a call to get_session_key # $bucket The bucket to search for magnets # $type The magnet type (e.g. from, to or subject) # #---------------------------------------------------------------------------- sub get_magnets { my ( $self, $session, $bucket, $type ) = @_; my $userid = $self->valid_session_key__( $session ); return undef if ( !defined( $userid ) ); my @result; my $bucketid = $self->{db_bucketid__}{$userid}{$bucket}{id}; my $h = $self->{db__}->prepare( "select magnets.val from magnets, magnet_types where magnets.bucketid = $bucketid and magnets.id != 0 and magnet_types.id = magnets.mtid and magnet_types.mtype = '$type' order by magnets.val;" ); $h->execute; while ( my $row = $h->fetchrow_arrayref ) { push @result, ($row->[0]); } $h->finish; return @result; } #---------------------------------------------------------------------------- # # create_magnet # # Make a new magnet # # $session A valid session key returned by a call to get_session_key # $bucket The bucket the magnet belongs in # $type The magnet type (e.g. from, to or subject) # $text The text of the magnet # #---------------------------------------------------------------------------- sub create_magnet { my ( $self, $session, $bucket, $type, $text ) = @_; my $userid = $self->valid_session_key__( $session ); return undef if ( !defined( $userid ) ); my $bucketid = $self->{db_bucketid__}{$userid}{$bucket}{id}; my $result = $self->{db__}->selectrow_arrayref("select magnet_types.id from magnet_types where magnet_types.mtype = '$type';" ); my $mtid = $result->[0]; $text = $self->{db__}->quote( $text ); $self->{db__}->do( "insert into magnets ( bucketid, mtid, val ) values ( $bucketid, $mtid, $text );" ); } #---------------------------------------------------------------------------- # # get_magnet_types # # Get a hash mapping magnet types (e.g. from) to magnet names (e.g. From); # # $session A valid session key returned by a call to get_session_key # #---------------------------------------------------------------------------- sub get_magnet_types { my ( $self, $session ) = @_; my $userid = $self->valid_session_key__( $session ); return undef if ( !defined( $userid ) ); my %result; my $h = $self->{db__}->prepare( "select magnet_types.mtype, magnet_types.header from magnet_types order by mtype;" ); $h->execute; while ( my $row = $h->fetchrow_arrayref ) { $result{$row->[0]} = $row->[1]; } $h->finish; return %result; } #---------------------------------------------------------------------------- # # delete_magnet # # Remove a new magnet # # $session A valid session key returned by a call to get_session_key # $bucket The bucket the magnet belongs in # $type The magnet type (e.g. from, to or subject) # $text The text of the magnet # #---------------------------------------------------------------------------- sub delete_magnet { my ( $self, $session, $bucket, $type, $text ) = @_; my $userid = $self->valid_session_key__( $session ); return undef if ( !defined( $userid ) ); my $bucketid = $self->{db_bucketid__}{$userid}{$bucket}{id}; my $result = $self->{db__}->selectrow_arrayref("select magnet_types.id from magnet_types where magnet_types.mtype = '$type';" ); my $mtid = $result->[0]; $self->{db__}->do( "delete from magnets where magnets.bucketid = $bucketid and magnets.mtid = $mtid and magnets.val = '$text';" ); } #---------------------------------------------------------------------------- # # get_stopword_list # # Gets the complete list of stop words # # $session A valid session key returned by a call to get_session_key # #---------------------------------------------------------------------------- sub get_stopword_list { my ( $self, $session ) = @_; my $userid = $self->valid_session_key__( $session ); return undef if ( !defined( $userid ) ); return $self->{parser__}->{mangle__}->stopwords(); } #---------------------------------------------------------------------------- # # magnet_count # # Gets the number of magnets that are defined # # $session A valid session key returned by a call to get_session_key # #---------------------------------------------------------------------------- sub magnet_count { my ( $self, $session ) = @_; my $userid = $self->valid_session_key__( $session ); return undef if ( !defined( $userid ) ); my $result = $self->{db__}->selectrow_arrayref( "select count(*) from magnets, buckets where buckets.userid = $userid and magnets.id != 0 and magnets.bucketid = buckets.id;" ); if ( defined( $result ) ) { return $result->[0]; } else { return 0; } } #---------------------------------------------------------------------------- # # add_stopword, remove_stopword # # Adds or removes a stop word # # $session A valid session key returned by a call to get_session_key # $stopword The word to add or remove # # Return 0 for a bad stop word, and 1 otherwise # #---------------------------------------------------------------------------- sub add_stopword { my ( $self, $session, $stopword ) = @_; my $userid = $self->valid_session_key__( $session ); return undef if ( !defined( $userid ) ); # Pass language parameter to add_stopword() return $self->{parser__}->{mangle__}->add_stopword( $stopword, $self->module_config_( 'html', 'language' ) ); } sub remove_stopword { my ( $self, $session, $stopword ) = @_; my $userid = $self->valid_session_key__( $session ); return undef if ( !defined( $userid ) ); # Pass language parameter to remove_stopword() return $self->{parser__}->{mangle__}->remove_stopword( $stopword, $self->module_config_( 'html', 'language' ) ); } #---------------------------------------------------------------------------- #---------------------------------------------------------------------------- # _____ _____ _____ _______ _____ _______ _______ _____ _____ #|_____] | | |_____] |______ | | |______ |_____| |_____] | #| |_____| | | __|__ |_____ |______ | | | __|__ # #---------------------------------------------------------------------------- #---------------------------------------------------------------------------- # GETTERS/SETTERS sub wordscores { my ( $self, $value ) = @_; $self->{wordscores__} = $value if (defined $value); return $self->{wordscores__}; } sub wmformat { my ( $self, $value ) = @_; $self->{wmformat__} = $value if (defined $value); return $self->{wmformat__}; } sub db { my ( $self ) = @_; return $self->{db__}; } sub history { my ( $self, $history ) = @_; $self->{history__} = $history; } 1;